/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


#pragma once

#include <flashinfer/trtllm/batched_gemm/trtllmGen_bmm_export/BatchedGemmOptions.h>


namespace batchedGemm { 


namespace tensorrt_llm
{

namespace kernels
{
// clang-format off

// Version string attached to this export.
// NOTE(review): presumably identifies the generator release that produced the
// table below -- confirm against the code that consumes this macro.
#define TLLM_GEN_EXPORT_VERSION "7.0.4.0.4.0"

// Declared number of entries in tllmGenBatchedGemmList.
// The array itself is declared with an unspecified bound, so nothing on this
// line ties the value to the real element count.
// NOTE(review): the visible entries sit under `#ifndef EXCLUDE_SM_100`; if that
// macro is defined the array may hold fewer elements than this constant --
// verify how consumers bound their iteration.
static constexpr size_t tllmGenBatchedGemmListLen{838};


static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = {
#ifndef EXCLUDE_SM_100
{nullptr, 0, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "f5e989ba85f0caefa74ac2c8e6068f2f60c7805444f55f43fc10873573f96cd4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "143ed7bd4284d12b4912b1abcaaefd35e824124ac65df63ddc9384dfa75e81c4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163232, "bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "bad85a297c76276bdb4dcf22665e2583bc9d01d5bf37f29c3f4bdb5826178d5b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 162208, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "c1bc3670255fbd30b894b49acd4295da4ef6c7b96e97db0966cb7c6a555e3260", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163232, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "6b414fe4363f1bc1e05e8c105ac3f221f115fa222a9ee8301d7c921bc1804046", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163232, "bmm_Fp16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "8b0a7c32d73372ca19b0c7b93b201a95acc147be19ece2983de994828eeffdcc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163232, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "0286178f645c28e12adb3d5ca428aea203cdaf6d3766eab9170af5dfda400ee0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 162208, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "1844f389c4b7422c7694c1b1f8604531875d889b4512f5dfe4265bafb00ed084", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "479ed5fbe2f17b2d428ccbbedf355915ced5aa9f6b3d815f14b903a4ff7288cf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "c8ab0ec75522aa9b65b226cf651bc9b2606a86ce4725852600f4293711f5bbd2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "94f6b4846df1652e73de199efc0ecd0e95a4ac7f44f48543c59d6dc916b3999d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "e3b46566f9f6fe046475475b7ee0a948408eeb6fcc4af8b17a8ef265cb36f0a0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_tmaOpt_clmp_sm100f", 256, "e01f9d3190b4f02951fd5b7f6be941558c7eb2c5f433cc4c3aabe43cfbddecac", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_tmaOpt_clmp_sm100f", 256, "c8d4aaec0103862fc95fa9cbdfc874dff9db57bdee471bbd3b1ffb7d959792cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_tmaOpt_clmp_sm100f", 256, "89d4e4dfaffb84eb48f1b918e37fc071544d8fcf7dcbc90ed43b28413b2e6880", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_tmaOpt_clmp_sm100f", 256, "0044e1fc69ca77f84f5d8b40c409a085bc7a4cd2f023c8d8ef2164145f50b5e1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_tmaOpt_clmp_sm100f", 256, "e3f55180630839e7ee0fc35c9a6d32e3a1d711e0d8c8565a0d07d78145eb6899", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_relu2_bN_tmaOpt_clmp_sm100f", 256, "9839377e07e9a67eff5eb10eb00a8cb294fc9f14bcc0775fea333f5312b15c43", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "242f8283198674c96f69a3dbef21d55b294c0c22199bec7c459624bb0aac9360", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "8f72f02c633c45eb44f4bb4de78df060b63b11adeaa34ba42bbd391decb06e5a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "782f6ed7bfffa1a53771dfef8338efc08b1dae64053b746d9cc5b0fa0f507d73", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "ca829ecf8f96c05ff4bdacacb7ca0c0f48a144fc38f0a27bd2ac89d36027b21c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "f0ae027b8f67f34f7da2ed883ca2e1e530f71f68141f91409a1aa518cd113715", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "c0de9d0288bce7a588dfdab68d98d767393cabed35d3ab4717ce25b000e901c1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "a789fcbb5a038281d1d3b5ed483ad70eec943804fce50fe05d74845023918ba5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "78a620b65a304489f9dc5af0b618049f12842fc233010a3e0605f653d7c67126", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "b55ede024fa0e0ced4d130cf1c7219e8ec9e0073a9ec28379ca8a6b5c06ee378", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "cb8d8a42509f0356e0bd83de446b5826e969812fef57de7baf2bf64107de605e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "54080a6aac9b0de2722610c90a9b9b5a85f5c37c62c31d664197bf1032ad2710", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "cc782fbabefd58d0b1aac4695c88f4deb61aea66f5608e2cd7f7cb187a1e8ce7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "107aa762ef590409bff2dcbd5858f35182337e5ced22a9faa112cb2132ab6c2e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "664def5793980a8acbd8c263c83672617aa4d12750860cfdb20bae4eed051290", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "7903feb6f73cc58f776b277fb8f066e01439dcc7e99202773c4ef7bac48b7e81", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "629a4891bfd8e992eebb80ac83eb1bc80cc93fab0993517cae188a32e8bfc0e6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "b3d3008c0893c816b27ba2eb7860cfe74fb29e6dec19b0e800983b24dae318db", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "f65c634d902d3a96a42415ff89ef6ede47f28f45ef445bd752c2e9415aaa0ec4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "5819715e5429fe89cfd043712d4f7fb10cc7e5e685f8adf57a10d930c625f24b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "fd43b7f7e2536502d16e44480a86bb97e73bb81bb8b86e67c9263513e3f8f688", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "c16a4fbfd96c111dfdee30caed26365e8a3a462c29cefc17b63f2b165654d61e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "114446f4f602d39e2cf62e5f33e8aea6ba188897543631324164f1156dd43894", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "b2f3cbf8436e574d68b83930fbd9b76724daf2e4e7a232f05f374d4b9b3f2615", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "eff443aedfce03c2630136240b415c59a04f287327c8640332cbc25053e80715", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 1
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 2
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 0
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "bf18b767bc01dc282ac3487a347fce50ec9525b79b325472af57629a0e62d255", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213504, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "277a441476916d620720757b8adf3f4b0d90c6bb08f40a3d84e2cbea387c4250", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 202400, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "5fab8e91f03dc124f4b66ebbe8a740ffc47be111ec4c1e0bf9b0b1ce1512f465", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213664, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "2b45bae4a927734bdd1dd9f0af1f45c7b1f4ef9cd6950bfc4943e82cc029ea16", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190880, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "bf0766e799870b5c9e1cdfdeb1612bff4e242f242a990c812e29806fd19734fc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 191040, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "ee57bf77bbb24319667e9e2943ac988e8697fbb3c4a46d66ce0965b983a5233f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "e8b2e15870d10c5cc3b06b0e2a14b77c2679fb28b324be51b3c7e078b26ed645", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 202400, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "69bb330161e75a7aad7774f241857e7161bbd4e8750d9a057928a02c096d4444", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213504, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "c803cde938b080676cfe12065cd785e67812a2d7d4cb5c396add170b0c7c2c20", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "34bc6870d9927cf86bee910095ba91fc962c40bb92537a7dfd9b9ffd59df4767", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213664, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "ff9af0896a5da9e52745b7f3e9392edcfe11ae2c52aaf2da17b1782f1f82818d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190880, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "012b232ef7c2027cece612efc6dbce1b425b3a06beaac6456a12039a2196a3d3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 202400, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "ba40fa265f0f4fbe18cd78e1ba979e9d132c4edd597e16dd08f795e3c6425989", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 191040, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "7d58e5794a809067a546a30b62b5eecdd8a684553079aa70d93a4cfe587e5501", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213504, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "cd1f8ed1438d7cafa305b443e2c11afba5f5b626e590f4569ee8512de044d36d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213664, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "91f759990ab3b99cd6b2ded0d073b609ff02b3a0b05a16be006ad55a4264ecf8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "9ff8f2feb6d92020f3590d74db605c9b4e9a534ca753678ac98a05fe18ccf04d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190880, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "91b810bb725a9e989fd8d9ab19ff49d5ffd0bb1fd5d60422f74b26b55346c62f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 202400, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "6a518d8d28e969c4c0cfe4d676741b58b114013db6f593a6c5ed2ea08d9ddf49", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 191040, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "79e93195b1a5bb1bb95f831aaef47a0843652ed43ffb054d0173dd641f79a221", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213504, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "4c2dfd3b83c360ffc6c2a49f649182d4b31334fc0b848c32901299aa396194e0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213664, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "a428adb796e5084426f27f8fbb84a4bbd90383e4c3955b3dde50a90dd8f66a0d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190880, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "5b24e126b66fc35c14f1e572b049033c3a29181cb81c88ac4c87877a7948b64f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "aa770752c804af5200a83bea534ecddf669d5deeefa03e75512d5d695b2d4d8c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 191040, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "764791f1a3e3a59d498d40e914620fd774806beaa6c0130cff6f0cf02831f74f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 202400, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "2c9621618abdbfcf69e5d519798c53e67cac844d534f8cdfc7f21e0360ae8288", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213504, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "57c33472da35104421a2203fee806e3ffccf4fc3edb8980ca9ec8a6b762cb9ad", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190880, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "67c7bb1a586a5cca43e638445161ca35aabdbe9b1ea77b223aefb130f8e76202", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213664, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512u2_s5_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "cab63993867cf0448764d0298b92c51bb2e623a555c7f4c728b6bcc26af7cbb6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 191040, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "98e7e25b99c0ae90d4e926d42f31e7fc6ac5c2d54206ef1ee3392e18caf9701a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 202240, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "207cd20ece321d757f6860e4af6ee61854ca89b2654fb1bfac8c2ec4b7e9b4e8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 202400, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "d2a25974e83345ddeb018b0779891036b1bef941c77d74b46c45e0d26ba3faed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213504, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "ee257744d6de2fee6f3e046cb95b24e773de3a92d1ccd4427d261100b239fa4f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213664, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s5_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "709a7656478ac6a0fdf1639184f6fadfcfd4c7bb4a62a66ba8c716d0a5d3fd71", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190880, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "e0dbdfb899a7700441dc4d14af0ff537a26a793651686c541f0fb79ad56d918e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 209576, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "657bffa22a56215fdd15f98ac56c839856657c77600aaa8b5abf12e703ec73a6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 191040, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "4937d36ee75954049bf4495b34931694e2ed6a7bff6160e0895003f9fead5b2f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213672, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "b3d5cb1258b2f4418292ffa4f2905513b1e81ae4fcb0161e775151a2d805bbe3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 3
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1536
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 3
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1536
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217768, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "c1cf8b70242c82b5f63387cf8937950c599b4f2ee20c6ce6077f715c18429524", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 4
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 4
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 2048
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 195144, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "90a1829356c76a51253525dedc3014037854ab3481a62cb076cb6c1547b7d59f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 3
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1536
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 3
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1536
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 203336, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "99d26e9abc651aeb380a5c265829bc49944b447133a1b62dd8d3f7c369b68e27", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 4
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 4
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 2048
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 209576, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "dc35fb7e6f62963bff4470ce6f0d3eb2c0892b5c104c0d9ef978620600eb6979", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 175592, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "d58ea0d55888f18282c225a9693e32ec49fa14682ed92603e62fc41d96a84b1f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213672, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "8318e7c4c00d51eaf6951551d092095cf62c8b112fd4c2631400ab10fce8f0e0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 3
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1536
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 3
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1536
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217768, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "278015b2b87861bbb8ca40d110a80f4ad87964b1c0ff94362bd8b8f22761684b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 4
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 4
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 2048
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 195144, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "15bbd2407e19ee64c1cd4154da4a81e05e02d96668fedc2b71ff0fc8ce964102", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 3
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1536
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 3
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1536
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 203336, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "caf31f07da6650ed4fa2fbeb8554c00047abbfb1258f336cf7fa5f7face8ef9a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 4
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 4
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 2048
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 175592, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_ldgsts_tmaOpt_clmp_dynBatch_sm100f", 512, "892aa821137de927abad0116bf4124fee100026f82cb564baa7066d3e553d322", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 196168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "2f30a87d00b53cb4f594634907ecaae7d3d8c3c089f161d3b9a46df6cdf57f6d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 157448, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "86d18447a764fa68c50cd5ac06d95e45b0a2890d3e056938b7c5af051b76b92f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 157448, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "9ca19d82902498212f600b3f1e5ea0eba36de2fa5f5898963ce232140699c5da", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 196168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "6c8db1d9116d82df46a0469e9861727a00bf24a88f10f03d97d13880fa2d0050", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 193288, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "81d8ab9ec57a0af556cec1c2bbe3d76c9ed030e18f4a792abbc580ff5d89fdf2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 193288, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "d34a30f39af59c66ec40d75d9973802e5e9868f33282e109f116af3aca257b9d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 196168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "1d6191ef83e3006ab13abbfa96a45b43289c1b4394414bd26bbb19aed53ad97c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 157448, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "64059edfafb462d2f80b30288a285472297e7e1f618115ce5cada4bf40a49cfd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 196168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "ec5e8b23279f8b213814373917ffd94523e4fd20482669708cfca84fc99930ee", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 193288, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "0caf2c45f6eb3bb9d7616ed3d46152ed0e8921069dc967381420b75bdd2f3815", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 157448, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "2a28378b8db4cfe74a8c8712de30d276c55ddadef0796096fa1df3b82f335e86", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 196168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 896, "79f11bbc4dea7cffc7a551ac9aa4d84a800f3039aaa6d20933e948c6a5f2f6a2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 193288, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "9a088ae1a2e3d103b73232ff7fdcc30f36239c9accc58537d06a56ce65365061", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 157448, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 896, "36047a35580eea5abfe00e3e84a707097d3c964b5ef408a5a6d4cece1fbe7e49", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 193288, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 896, "11247b59b7c24ccd0cc9491ea4a2a78f45a6826adf0bdd4f2282dc029a133cef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 196168, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 896, "9ea0bb6571902da410734d75e65551e5f360a852e10d3f261d39c1fcbafcc290", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 157448, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 896, "443b0e2c3c8e98c7ff80b1108ae75c20c91d1f4f5858ad18eaf3e09fa0222d89", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 203264, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "0679c09a4a52ca72d49e8a7edcd5863cc00e28086f2b4c50e7585460c42ffd7a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 193288, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 896, "9a7f96d3a59c841fcfda3875d484c355901486e3ea58fc98309067899e4502c6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172536, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 768, "92d0ccd78ea006cfcc666ba3e2b4ec2d7d0c7a6e52f008fc313c2c38e709a92c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 1
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 8
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 144
, /* mNumRegsPerThreadNonEpilogueWarp */ 88
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 32
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 40
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 4
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172536, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f", 768, "324cc8d6dd6cafd1dcf7be13342c1529ebf39288fdc92cb82e8ca4ee9e8e7028", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 1
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 8
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 144
, /* mNumRegsPerThreadNonEpilogueWarp */ 88
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(1)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 32
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 40
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 4
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172536, "bmm_E2m1_E2m1E2m1_Fp32_bA16_bB16_bC16_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_relu2_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_lbW8_lsfbW4_dynBatch_sm100f", 768, "b934cd2d39755fb968e6f28f10d1d78655165546f23395317700116f31ea3c6f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 1
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 8
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 144
, /* mNumRegsPerThreadNonEpilogueWarp */ 88
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ 16
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 32
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 40
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 4
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 203424, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "34a5755e3bc4ca5c8dfa8757bbbb38e898fa9a1737f363983a2d19e046325772", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 216576, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "223c41656552226f3e5c31ef61ad2da44248dbca3208c824bc937f232015066b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 216736, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512u2_s5_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "c0ed36ae7fae75af1923ac43cd299b1eaa75d20e339cc678c6ca66505cdc8565", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 197024, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "35f2c850213b6ea5252866bf0e79122ac7b952dc380118a75ad0e4b87bd817a8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 197184, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "ce382082b77983b46acc0e818a9e1777e3227130ee0a8d3de7185f71c4fbac2c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 187712, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "9676cba268ed8cef70c735da3494eb481f976e00aa36b71cff48a94ba1d08f44", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 124512, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "4b702162692e811060dc250bc730aa19f3deb96d4d95b7108253595cd568ead4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 187872, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "bd66a078ba97f742a41d9f4006d773543af58e7fabd13c284ba8da09537d0d4f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 124672, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "15af1d8501e34cf8f6584e7ad64671502aaa75984c25c3e5878b95cb4905e040", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133728, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "f8e7f5377ef8d5d1c0c95774510525ee567d42a05484b2368f3771947216390c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133888, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "048fcab3bf4e5a4d1918477bae1e66bcd0732187497396b808be63c7b26fc446", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 151136, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "7bd2d8d499d83bfb356c3392944c87239b52e523bde05a49893bf8fb3bb87632", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 203264, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "cca64bd7525ad976f3911bb452bbac52293d2a4d6bd004d482ca01e265a9689c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 151296, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "b9f5f148f292d4490ccedb98f0c70b3ae5f5f465877b298bc48faa626379a635", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 203424, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "ae0d2205cd37572635429d9b190dda3393e7b65a96db8c626f9ad96f88920922", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 216576, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "d586371d6c6c8f1fdf22d5dace1586baffd45e55d4d80d40a98326223cc70822", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 197024, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a812277623e1d2c27bcaa226357d8603109450b3a84abf4d702c5f10cfc136a4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 216736, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x512_s5_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "5b5ebd58b5ce25c63e7dfe1f90da09a3df01beb15c7e87e25e7495a66f9889d1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 197184, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "b5480dfb9063a649de1ef96eee0cc2487b9bb57c7378749954c4174eb15c9eb5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 187712, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "7e5adf47d761abbc604aa49ce7d4b2a791e4884bdbc4cedf71f40b898c94a722", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 124512, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "9082f46e855a4677e9942ba5d468e347e8a87d1cf04ad8bbd7fbc4c94fb1b400", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 187872, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "10616c4834c5939b6d4d0f237881489d7c35ce7ab500dc86dc39b453ea9c1a96", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133728, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "deedac80a8b1f1a5f8e02b68425489b58299e8d8339efd23f1cfb7ef9fc5e85c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 124672, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "b4312e036ba299b050dbf38027e3efcbdcd6f77ec6447043db2e7ae7aadd36d1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 151136, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "520ac953a56d56a85aa8623b6b51923f32c064451b11f30d55ef7c574cd60875", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133888, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "662fb1dcee4823d7a26008abe4eeefc690821c183dcbb871b5b57787562409ec", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 151296, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "93268e45844442e26e6ff25cb239cf196658fdee731f6f825cfc07060866e9dc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 209560, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "02f635e97f9872b29d0edc052cb7bfe0c9f1c59fb8f1d97618c5584b4075014c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213656, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "896aad9bc320f5a1c51819d51eeaa41f394f95b3820d19a659295acdd3c2ed17", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 3
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1536
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 3
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1536
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217752, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "19727a05c0fe6873dd25f5be321835f551faf97b825bba4ea13b3d12ecb4876a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 4
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 4
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 2048
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 210504, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "7f547e7d8fda04b782c6e0e46a44c3a50c0d71aa02dd2add5124478daf6f157a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 211624, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "92305a947867238c11f0084b609a01b17bd669423e51843314c224229ab5d3a6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 210504, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "78921afbfac2ca1cdf47ce568fb239d2d4d624a85aa78318c9ce669ef1c77679", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 211624, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "8aa2acaa66f9bfa9536a3e308fd15b9326bb306224cb921e83b19c279818f989", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "00f737441a17dbe3806ee7a39798dfde19afdfa83744155a3528093c95b221ba", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "2b93379924a6fc39db12e5fbcedef1fa07a5f62a4d3b44fe076a832dca84b50c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 219720, "bmm_Bfloat16_E2m1E2m1_Fp32_bA16_bB16_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "45199cb6225449d1661cdd7e5addf12d230f3d853f22cca8624cb21bd576b9c2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 1
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 16
, /* mSfBlockSizeB */ 16
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "66fdbb079f6f4947ef5604a969f618a12658c063601175ac08adac5443609ae5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "49f16d4115065eb4f420f30011e6f65c6a2c1a62983a55ffa7c7a90fb2e51cb6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 148160, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "5df1ef3b4ce5d9a77997f4512e264f0c430f729d5078f663dcb99ffba3b8fd34", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 148160, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "e8e99a06b8912c7125b3c18b4d8770eab32631a44c48c908cc36ea64aeff6fd8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 148160, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "7b2b42cff1c8674247f9eba0a8f530f29cf0dcab920dcf1f0de7b8ea7bf0db0b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 148160, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "7ed10968b349a476dbde86ad139be5b091c8ff4ddb415532f58003b10d0d0011", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "99ec386bb606a7c1a1decf0fd9742fa8736300424c05ea6c152e23596db766ab", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "35c52c2a47986ccd2611215901df8b99285a2854e2e07bf5e3833e56a6a6cefc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "c5ce54de732c3a9290e115affee33b9c7f1bfea270652d3a21737fb2b05cd144", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "36069c4db6f47ed5afd2281c8488c464dc17d472ec4b04683ea372e5029a10be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 119136, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "18bac795361ea6dd5f03519515fac963cb1b172dc4620cab5ef10c3185cbd002", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 119136, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "4046d1fd58754599f36e23d38de349787a6a8f416dd259f2b039fa6c92d577eb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 119136, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "13e2dc4b0da2104b1feaa773179c694fb652c219b2b82b8a4f2f816da5cbe009", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 119136, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "94b96207bb8184c0122a000c3538d6574a6e5efe9dc33844e2e9a3befa7ebadd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "ef56cee45fe1124e8f2f52873f4b4e0adabc0510ee8d272a86df7ead80e18cfb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "8dfd919a928b0c798b46e3c9d089aea7a68abab3e0f4bec607d4e46a3e86079e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "c0cf502a5aff149bb60620df50a8dbc924a37a0d3b7e835c6efcf4c3a6ba9fa2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "19ebdc879b656e74c5e20ddfaf869e100c698f8efd8ff3a37ff8536157850eb2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133472, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "b969776e84fd47c0b31c58265044a1f8f47303ecb892661acc4e27c72953e6c1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133472, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "08a0f94b5994a370bdfc5e2875d0c044e5aa5c381e2c9c9d0c3db079643cae4d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133472, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "e09d6d34d55875a56a5007c2dd5af2b454f3ff94e8d8774556a0b346111b925d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "9a47d42e8983f95e164838ebdc5ee0b1606b4a4786d4ac1e5c964023bf958653", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133472, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "4e34177e50c7c9e716632bb34217ed9cb770f5b0e5631a3b5d2e79665d5aaba9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "ca88150d9280f2629b14c2ded602ffa8493956d5a721fa5984b2c6dca2958c28", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "8dbedf54037906ab2aad05056b22af53f86fc327573fa45ffd2c507f4d308cc0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "4f7e15d895ea2ba93fb4ae605bb9a34cc3a6a2cbc37fb2eee8b94078547ed972", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163936, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "adf9559a4d481915b5b95d6bca14d0c605ff3061b9bd4075e7fc92e60c757fa3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163936, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "294f24ed28787f010b777eaf599151069ca77348cb556d23ee8babccae316181", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163936, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "c29d99da88593f95baffe8e99c7e63943fc369f3cdd5baba2bc26fce26031885", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163936, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "1c45597e36da9856b91b9b96bb43feefd32c7f0d0b9a81dc36470f19081138f3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "5143f6b3e9b416c89a2ffc075d886a9b0afc951260c65b12d9fd878381d9996b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 80
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "9025f3b42d0eb0b9d418d6c6d4f550f1ecbbc888336e69b5b9f16850066cfb4f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 80
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "daa389672f53e8c797e4e0a2edf1f84792943feee9221b867d0ddaf6b77aa24b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 80
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "4a5989f87e08b61885ef585c09ccfd964f5b15920c4bc9773c61bcbb2227292c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 80
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190512, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "c92cb4422f9081dd34c3c99e3a5455d7d6e5e5f92dc36daee392d8941b3cd8f3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 80
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190512, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "62ada6c18743814ef6b2e61be584fc92dff08ce933e4fe3acf51701917ae4c10", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 80
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "2cbe93c3eafa05e730e8c3e7ac3a22f1f04e921482c9876de4284cdd256926be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "2b7414ee2b820be628f5ad040b76b745518378090763d8a3a16165d42579d6e0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190512, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "8fa2e28a6ee6e409af8bac43969d518f1b07082a24d05c125ff4e6ac1b2d20f8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 80
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "0cae80c2e8ae9d3fd79875e4f7deac49dac9e0249e32eec41af311fa7bead8c2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "ab0c8ff62c545df6b1d6ff6a775e0fdb354f6d4171326478b226c6386579d4cb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190512, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "4f08b32a8abc5a7f709e2a2a745a919a30bad67d856ac6d08667e1fe358bc2de", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 80
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 148160, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "8bf571a4cc5ebdbbbed9f06791ef73f3d256acceb5509ad2bfc1fa960781ef4a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 148160, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "451d58be4a34bc187cb477d893abf4c7c8caf8188c816e80e31133d7bf762fd4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 148160, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "bea19b5d8d4a4682a2fa973f767a1f0275406c0950dc351bbd4531ab91e5d65c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "69bcccd373737727094a84ab0ae20f6a3bdbae70bcdceda0384c60ddda4876e4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 148160, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "5b0f78278b5ee146f76827e6fb07fcfdc590a2f5fcacc9879639443821a2949a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "4116f0e5650ffe85df6a2bfd6b322cc6584905aaf914b725694c5eebcdbc19c8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "58cedf77fee5d2a0d166f2234af521644b669e9c657d3460ee5d89007d82df76", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "b49e45109ff7854d04accbc28f04f188aaa3cb18d104324731e22c195a2e65ed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 119136, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "1ac0c8b926f9b33aee2aa21f797ff7b85f42ce1c6edcb456494981088e05e7b3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 119136, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "821f030df36bbd2908c15da7a05912505faf86ff144845222ceb6f84ec34e094", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 119136, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "05a9b72f3ddc5c686f3361487ddb5ba202e9325737298ba9e3060e1e6e93e38a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "5f8bacb369df69d5b314983604bc8ac747bb8f49de2d9a3589c46121e082f6ee", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 119136, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "2b4c02520db73702e86543f78b79a754e2237756ae0f574ed7d85e13acd10e8b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "a858a49123b11fcb1a56b859a99c1f0e36e477e0570322b8abbae0eb0dd0f005", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "fbba7d87fdd79bcefd1bc20efccf7565686ea593980d2ac4de481583d706abb9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "a3a84a119ffa2243ae4a1c8a2c24b9d78141fccec40100306aa20a07054096f2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133472, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "c390a38c46161879980c7b5f4bb4877df25299b47562a7b720c5278f38739805", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133472, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "ce98af0924afcb6b83b1ffa9a814e89eb2f3d67c05dad09524dd31c7f30b968c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133472, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "3e69af4af3cd3726a58728364e9c6e8ed8f28062079749fea0bc561fcebf44b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 133472, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "5669ce82851667700adf98e521065ac09f25ad26c23d020c197aa7e384e14f6d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 2
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "1e29f03f037ab994cf292e3f42d05a5417a3df0986baa30ce21269301c76e701", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "ccf5da996f0f4a94c693378cd74db34bd800da778997eac478502fcd697b02eb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "f6a2cea03b2aaeef87157e18f9dd98790bbef1043b01a6455e1b7969d1382b99", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "35d44ff2a16696446d03930719325205343de17707b62f07cd6af04deb49bd2d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163936, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "8f97428d6090113813d29c13b0c72f7af1b2e7dbea0861fd9ecfb75bf50aa921", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163936, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "0d19836e15dfbd7b67eb3f107cb32cd43479ea96c0d2cc8c241741c85e654321", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163936, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "7214e71d679dcd2c30ca22b4f02cf37ed38c220660ee784fe7991d1081ae7b7a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163936, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "678550aada8c6b27f662e9dc5bda34c0208dba1eaceab2229f7381df029403ec", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "4e83c183f49fd1f6eaa99bda20a23cacfab4b79b965462760a335cd0e66513bd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 80
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "068b4b83af9d7c83a2a2da263686bb39b75d2b8a95261df251fb28cbdbbb33c9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 80
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "3c073070b65f3d1c7ea5f21f1d589cabea63a486be263497c6ef58d1fe145c51", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 80
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "17c13fd91f19a89c21e5d69529a0a554c1e104a147592353b38590a2e544ea55", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 80
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190512, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "73f6c2b872bb95f255abb7a4ef4d84370c2ea3a6aae02719d974f650f3f5f31d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 80
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "fd90db2ff1ae333c1ad1deb828a34c26e4d5c91bca0ffe4e70ded782281e3507", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "f8b97f0819c62dfc351f2d1ea658d34b3bf31ffa248a4a911dd7e06f018930aa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190512, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "943334c475a9829c4795179d230fb63faa2b29293354b08275bf04b147b86eb8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 80
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "d2e6c0e3b6b583b286d7bc15421a11154de5606d812a246513f4489a715056bd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
// Generated kernel-config entry for the Sm100f kernel named in the string literal below.
// Key option values set in this entry: mTileM/N/K = 128/128/128, mMmaM/N/K = 64/128/32,
// mEpilogueTileM/N = 64/128, mNumStages = 5, mNumStagesMma = 4, mTileScheduler = TileScheduler(1),
// mLayoutA = MatrixLayout(0), mBlockK = -1, mUseShuffledMatrix = 0, mUseUnrollLoop2xForMma = 0,
// mNumWarpsLoadB = 4, mRouteImpl = RouteImpl(2), mDtypeC = Dtype(1050629) (same code as mDtypeA/mDtypeB).
// NOTE(review): the leading positional fields (nullptr, 0, 190512, <name>, 512, <hash>, ...) are
// presumably data pointer/size, shared-memory size, function name, thread count and hash --
// confirm against the BatchedGemmConfig declaration, which is not visible here.
{nullptr, 0, 190512, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "b4621324b4707fc50a9595ed82a9d67f6ce47d9fe3ad388457edaff72c19576c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 80
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
// Generated kernel-config entry for the Sm100f kernel named in the string literal below.
// Key option values set in this entry: mTileM/N/K = 128/128/128, mMmaM/N/K = 64/128/32,
// mEpilogueTileM/N = 64/128, mNumStages = 5, mNumStagesMma = 4, mTileScheduler = TileScheduler(1),
// mLayoutA = MatrixLayout(2), mBlockK = 128, mUseShuffledMatrix = 0, mUseUnrollLoop2xForMma = 0,
// mNumWarpsLoadB = 4, mRouteImpl = RouteImpl(2), mDtypeC = Dtype(1050629) (same code as mDtypeA/mDtypeB).
// NOTE(review): the leading positional fields (nullptr, 0, 190512, <name>, 512, <hash>, ...) are
// presumably data pointer/size, shared-memory size, function name, thread count and hash --
// confirm against the BatchedGemmConfig declaration, which is not visible here.
{nullptr, 0, 190512, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "2850fac438d15f91ed03d4990feddcbb0f2f279710f248e801b2b93009eabe77", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 80
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 4
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
// Generated kernel-config entry for the Sm100f kernel named in the string literal below.
// Key option values set in this entry: mTileM/N/K = 128/8/128, mMmaM/N/K = 64/8/32,
// mEpilogueTileM/N = 64/8, mNumStages = 8, mNumStagesMma = 2, mTileScheduler = TileScheduler(0),
// mLayoutA = MatrixLayout(2), mBlockK = 128, mUseShuffledMatrix = 0, mUseUnrollLoop2xForMma = 1,
// mNumWarpsLoadB = 0, mRouteImpl = RouteImpl(0), mDtypeC = Dtype(1052672) (differs from mDtypeA/mDtypeB).
// NOTE(review): the leading positional fields (nullptr, 0, 147952, <name>, 384, <hash>, ...) are
// presumably data pointer/size, shared-memory size, function name, thread count and hash --
// confirm against the BatchedGemmConfig declaration, which is not visible here.
{nullptr, 0, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "381e449fe4f46f16280d6cc10568936c7162d4042a8142438526241b0d6a8381", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
// Generated kernel-config entry for the Sm100f kernel named in the string literal below.
// Key option values set in this entry: mTileM/N/K = 128/8/128, mMmaM/N/K = 64/8/32,
// mEpilogueTileM/N = 64/8, mNumStages = 8, mNumStagesMma = 4, mTileScheduler = TileScheduler(1),
// mLayoutA = MatrixLayout(0), mBlockK = -1, mUseShuffledMatrix = 1, mUseUnrollLoop2xForMma = 1,
// mNumWarpsLoadB = 0, mRouteImpl = RouteImpl(0), mDtypeC = Dtype(1052672) (differs from mDtypeA/mDtypeB).
// NOTE(review): the leading positional fields (nullptr, 0, 148160, <name>, 512, <hash>, ...) are
// presumably data pointer/size, shared-memory size, function name, thread count and hash --
// confirm against the BatchedGemmConfig declaration, which is not visible here.
{nullptr, 0, 148160, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "7f90dc717b6c9b32e2acb8454b2c45cd338f209f5a8ab7d677b927384c28dfb0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
// Generated kernel-config entry for the Sm100f kernel named in the string literal below.
// Key option values set in this entry: mTileM/N/K = 128/8/128, mMmaM/N/K = 64/8/32,
// mEpilogueTileM/N = 64/8, mNumStages = 8, mNumStagesMma = 4, mTileScheduler = TileScheduler(1),
// mLayoutA = MatrixLayout(2), mBlockK = 128, mUseShuffledMatrix = 1, mUseUnrollLoop2xForMma = 1,
// mNumWarpsLoadB = 0, mRouteImpl = RouteImpl(0), mDtypeC = Dtype(1052672) (differs from mDtypeA/mDtypeB).
// NOTE(review): the leading positional fields (nullptr, 0, 148160, <name>, 512, <hash>, ...) are
// presumably data pointer/size, shared-memory size, function name, thread count and hash --
// confirm against the BatchedGemmConfig declaration, which is not visible here.
{nullptr, 0, 148160, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "4f2e2228c75f8072b7fa055a856550f2a87ec9718260510ba7a19d66db1634b9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
// Generated kernel-config entry for the Sm100f kernel named in the string literal below.
// Key option values set in this entry: mTileM/N/K = 128/8/128, mMmaM/N/K = 64/8/32,
// mEpilogueTileM/N = 64/8, mNumStages = 8, mNumStagesMma = 4, mTileScheduler = TileScheduler(1),
// mLayoutA = MatrixLayout(0), mBlockK = -1, mUseShuffledMatrix = 0, mUseUnrollLoop2xForMma = 1,
// mNumWarpsLoadB = 0, mRouteImpl = RouteImpl(0), mDtypeC = Dtype(1052672) (differs from mDtypeA/mDtypeB).
// NOTE(review): the leading positional fields (nullptr, 0, 148160, <name>, 512, <hash>, ...) are
// presumably data pointer/size, shared-memory size, function name, thread count and hash --
// confirm against the BatchedGemmConfig declaration, which is not visible here.
{nullptr, 0, 148160, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "4162266ddad550a18a80050aa5698bbcd79d31d1cd7343c07ee3970ce102be1f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
// Generated kernel-config entry for the Sm100f kernel named in the string literal below.
// Key option values set in this entry: mTileM/N/K = 128/8/128, mMmaM/N/K = 64/8/32,
// mEpilogueTileM/N = 64/8, mNumStages = 8, mNumStagesMma = 4, mTileScheduler = TileScheduler(1),
// mLayoutA = MatrixLayout(2), mBlockK = 128, mUseShuffledMatrix = 0, mUseUnrollLoop2xForMma = 1,
// mNumWarpsLoadB = 0, mRouteImpl = RouteImpl(0), mDtypeC = Dtype(1052672) (differs from mDtypeA/mDtypeB).
// NOTE(review): the leading positional fields (nullptr, 0, 148160, <name>, 512, <hash>, ...) are
// presumably data pointer/size, shared-memory size, function name, thread count and hash --
// confirm against the BatchedGemmConfig declaration, which is not visible here.
{nullptr, 0, 148160, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "3a69be09181680d7d20b3df0146625308615a51c04a321f25e5833c92253bf2d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
// Generated kernel-config entry for the Sm100f kernel named in the string literal below.
// Key option values set in this entry: mTileM/N/K = 128/16/128, mMmaM/N/K = 64/16/32,
// mEpilogueTileM/N = 64/16, mNumStages = 6, mNumStagesMma = 2, mTileScheduler = TileScheduler(0),
// mLayoutA = MatrixLayout(0), mBlockK = -1, mUseShuffledMatrix = 1, mUseUnrollLoop2xForMma = 1,
// mNumWarpsLoadB = 0, mRouteImpl = RouteImpl(0), mDtypeC = Dtype(1052672) (differs from mDtypeA/mDtypeB).
// NOTE(review): the leading positional fields (nullptr, 0, 120976, <name>, 384, <hash>, ...) are
// presumably data pointer/size, shared-memory size, function name, thread count and hash --
// confirm against the BatchedGemmConfig declaration, which is not visible here.
{nullptr, 0, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "4468057e70c9a514e2e47f89bf7bc06ec84b9bf64d94d9f98639d2900f8323b6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
// Generated kernel-config entry for the Sm100f kernel named in the string literal below.
// Key option values set in this entry: mTileM/N/K = 128/16/128, mMmaM/N/K = 64/16/32,
// mEpilogueTileM/N = 64/16, mNumStages = 6, mNumStagesMma = 2, mTileScheduler = TileScheduler(0),
// mLayoutA = MatrixLayout(2), mBlockK = 128, mUseShuffledMatrix = 1, mUseUnrollLoop2xForMma = 1,
// mNumWarpsLoadB = 0, mRouteImpl = RouteImpl(0), mDtypeC = Dtype(1052672) (differs from mDtypeA/mDtypeB).
// NOTE(review): the leading positional fields (nullptr, 0, 120976, <name>, 384, <hash>, ...) are
// presumably data pointer/size, shared-memory size, function name, thread count and hash --
// confirm against the BatchedGemmConfig declaration, which is not visible here.
{nullptr, 0, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "69f6480845eb0dd5bdfe2a440da9b9d27d1e710971d781b6a444c45da6702cf3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "5900044456f548e16f280b9b7bfbe4995843b11d232280facc7251e92df00fdd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "bc25236380600c530f546f6502d8757a2a67f5e1805489c29062545902d2fa96", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121184, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "af86bbb3401a0d55d4e7425c339f16b192397812b86d1350b50f1c265ebcf93b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121184, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6cb2505305a96453310e45fc21be1eca5ebba20b4326af54cdaba190d31dc9a4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121184, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "4ab6fb8b7610550d7609e8a7d7d6b5685fb5e27cc4a018f0e3cdfe0b2580e722", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121184, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "c917397821da11effb748db60671ddba6f45cd98e15c9e2fada3f172481fc9a4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "c0dedcca76bb4dc5292a25b69198e13fd65d7fea7469b7c2399b7603200c042a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "fc5a22e480e22b453fa21a4bfd512d68e704d9766015c7ac90175ad2b05ca425", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "6f790a5b3e2eacece083591a07e11a689b7c0f1254202c84288a10c7b74a399c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "fa684996c165fa3173f39b0bd3c7b9c0a856af248ab7dd871449a43d3286cc49", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 137568, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "cf43add323cc886875899cb4f05f6cce38ce21b7d01f1b4435715fd9adf5bcd8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 137568, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "10cb4fa01fd305724dc880571aff80bc16cfdc068af2dab8dfe9593ae4773a35", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 137568, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "2248f4d784e88816efd59d3c5c060aaac1b0b1c3dd661791b92e9779d22a9fc4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 137568, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "48fd565630c35b34452a6d54bbe1432289e1aeaba6c1df9d3b23b9670c74cb98", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "3c4277d2ece841276172b2663836cb7cde6e8282a2568526348bef7ddc88332d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "f8b6fab01a79c4b4e82d9960fba538dc72e32534d84a5f8c4afb4f36129df027", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "a653e2e0ab4f9254edc9a89b9cb73cd4f9717cd27fe1d449bf1ce215bf4c7ce5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "cc92463de6ec64006a35111b7102a36f2301264a33a80d5e361ffdb82f2c591c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172128, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "c0e69aa39af330ceef79730ba5b8937ce9bf309ab62aec570b7ecf275f5c08cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172128, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "2dc5533aaff2ea3328fe86e278166639ff60afcd31124b078e4a8636d15c2856", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172128, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6d0fed23c1abe55bcc0134a2f90aa7b4ca9e7c8b21fa786d6066252e59396e2d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172128, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "5d3359d5cebf78f877d57bc27f230b70c77864e9623f30769f2a1bcf1962aa2b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "95a0ac9625f4c3ce488955c8ed21eb1d8ed3082ab53ee1059f6e15319f6ff7f8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 64
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "8ab68cb4f9f68865c2bcc00da60692882a001c90d1508a20d0707da551bae137", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 64
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "1e00fd18418af4bcc2938c109c78c4cacdb9bd16a836894d884de9f10be7af81", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 64
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "1d12975cfc62cbfe71998bbe98a44cd3d5f38a4de8c9291a8e0bb7be1389500d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 64
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 206896, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "df952a7c3fe1bc32d7e02abfad69a279cf128ce4c03086cea7afc05a64f3ddc6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 64
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "2502a041f9a8a9e6f4a9ee033243ccc0ec351542fb6bb9dfa9584857fcf716a4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "a009ce210f87a9b87fdc67c775bd0c2e962d2f2f84355900e23c61c21a3dbbea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 206896, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "33c67f693341fcbf75f2e20dab0a57d0d8ee5c7184d1a8eec91b06415745fe2f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 64
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "75e2026a937f55864936d9c6dd69f2cc2261b3046b422458a65ed3c413c0aa91", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "93ebf95f184063a040e3d9b428219de19247e807fd356ec5be8ebe0ec0d5a5ef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 206896, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "e1492057f502a8931f4e1eb0159adefbcf9ad7f42ae1c9ae8d53eb2614295ddb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 64
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 206896, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "bdd10daacd283aa9f6c4004151fc47a9aa866c7884668f5008af457ceb4d84e6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 64
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 148160, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "8433f666895921f4092cff0fbb09b496e18f6b839c3b1e17f2b15b377b21dddc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 148160, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "947c961620f698d4d49e01c41308997c4cd17612b3d592372d4a68721e037ca0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 148160, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "1c75f1de3c7cfe233144503f399390c35ac344ba33a16d9b785956cff2862afa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 148160, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "41e06b10c7339cdb8159d971689cd2dab33c93215d464bae1beab545fe2a4578", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "171d852f85e37ee42b519490113d622b2be610a5f3eaf4f926b359e4d75fa21f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "28a6177ea333ad16e3f08c71c0245856a9cf6ae249a137d4121b224a5ca4b656", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "cd2eed122e8e46dd5bb67b9f718021c9c51ad27778d64d40ab5fa66755f0da9f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "014e06a2ce5d05c4ef7e4b00d282913ba90c0c3771d1ff62155440122cd4be5b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121184, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "165932c1f248db4e15a72ba0af322fb9c45a2f4f88034e813c57d65b32695db8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121184, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "e59a545b8f0e63570d94d1a148f5ababb116bbd854520f9c6a3441726230465d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121184, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "d3c694c7f3904f8ea1180729739a9884d45ff0e79b7197a571ef2841311a6b9b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121184, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6149070f60c1206b75f1f54632235aa4eac5e2038003bbe05023102b199b332e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "bc9945df1904024897a46e8748017404fc5525039485e8aab7e9f5a9cf91adce", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "6dd04752af14a8c85b76ac402ad67b37ca619ebc562d44022cacf6ee1a936ce4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "ba9d90b420784ea3813bba59ee889540e02643fe61edd355ba396d1911c0f8c8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "cf94a948b4ccb7bdb5b06ac92cd3e1c353aad9b4e16685d0fb30ba409b9dbe71", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 137568, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "3372043e8f419b680077c09790b05ba9779f2002011acf270d7a22fc9bcbab96", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 137568, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "8aa99e176492e6650a0e5ab978e72e4fbd868e9d5fd7d7f68725cae286331bd9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 137568, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "bc63c5814e09681abcee1144bc0f01608867e9b4212c7235018d75655a58953d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 137568, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "330b362d913ae68d00b9a5fa13021053495c76960bb8c58092b79bb02a25f9a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "703e05aef54a14ecbaaea3a80c5704596b98acdc7636e8e69643bc0127e977b9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "3f05e24bc19ecdf8c97bc1a23db618c868a3fb838a086d5117286f026178b430", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "1a5777f3d5e68909b06832b0b70030779f14e23780b4e858dbd12eb22de4c473", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "9b021a21df31eae46e8a89d464cb9c77683224104883a28a8b42279bd362ab79", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172128, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "c19b7abb42fec54b28f3a5343f5ac4c046a599bdc2b3ec300e6a40ae57c458f5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172128, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "c80f3a9a7b0e251e06cac29d967d3ab1ccdd98e9f0688f32b43b97e6fe4c470d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172128, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "fc7f4f1a4502b170f907b47041cdd2480b747313810bd3564990a9cc963185cb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172128, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "143fe2a22e4a3c29b3a4c75ceb1ec317f1a8595d7fed33a91aa042181d441895", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "fda45deab4c56644ab0bbbdd9e342ee189a8390995bff9916aaf613062a3742d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 64
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "40b434abc973f238d48178c06f9fde9b5aba0d635c46847e21634204fc0f09c0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 64
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "38f3dca32e0e1a7546a3aad1a1fd8f2ec880002749e2969afc7b8b55a710ec35", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 64
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "9dc83ee750fd5ca95054ebc50ab627951ac02f32f25f194d1bb772d4c677bdd4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 64
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 206896, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "d30cd14a490b03663d5cc81aea1cc77a28efd9d860ab48f9abd3d8946118ab41", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 64
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "ec650d021c08c2dea0a24f53e5f104fb30e1cd521693e2e14103c8ed1bb017d3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214400, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "9302615898d5b87e1cae551009c6e194c4a5615845869c75ae3918aa1a0f9b5c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 206896, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_BN_transOut_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "e7f9e04205184908cc7854deb66fc1f29faacc84ebfef5f1af61bce54b900dfb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 64
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "0fca6c8f07339c2d6989fb7e3004cb880685791573f06debe455b6f7eeb2632e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227712, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "d02fcd36919375898dab8b8f335005149593ba252795c30a592ab231aab1b546", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 206896, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_BN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "92d2bf93ceaf5729e9835ebe378f373a224af79776055e62f0e8041ccdc903c7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 64
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 206896, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "ce231f70a6235481a6d98693da25bb70ce213b91a7d5cfa8577d9127cfd084a4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 64
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "3f198dc4145cc1f460c21b1bf0b4cc533b1d549fd69afb016dc78144a54d0cda", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "15030d9e94e99520121e386866a992c83a447caba1cb82eab77fe4e5c13fb0fd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213344, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "79d03137fc3fa1d95ea274ff551090c9f31658f08a9d06202ea2f84bd2d2de27", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "ece038c0046dcace606b591bb7f4c2db1ecffeb98aff8943a02f14c0cf5067c5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214400, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "ca82693fb55019e93e219844c168757b45ab72527090b3d6511cb653c311f54d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227712, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "b36ef10b23a776a9e0ef0ef2255aa5292720d3cf23d9c2bd37d5e03ec4cdcb72", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "7d37c7bc165526002cc747ef9f30bc6f2e42d76393061aa491763dc9beeac7a0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213344, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "2882937300b7a8b05c9195341b626fb61e8ab741bc87b451724f2b950d5723bd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "e15fe54678e6d039a1c198b7396bea5366e1c46c84c4a6b60759626f0c5ae3f6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214400, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "06877b74359b814b27ad605ed8bea37dba69bb4af93f77701fd8cdf7861e5703", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "a3c187813aab98a93a6001473080550821f1a347a25da6abec9fcf66e3761413", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "44f673382c202b2095117d51a0d7eac27a7429a9c6a99f9643333c6874f0854d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227712, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "43828790126e577dfb95c5ba64ff2d760944e25e4a4d68e6235ea642cbe6f75e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "0740e9957e12bb7a3a06b44fe14862d34850ba12f7654fc8a54e1f21cf6f11c4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213344, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "bbf402a6732658243797d15e13ba9b78bcad70f1f6bcf30903a060a7275763a2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214400, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "086eff19f923dad5d13446c5527b1a46214962498249cf4ea1810237fdcd194c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "f3403b8abc81245526f218a778a8f942a38477449958fb84617003a0b237df27", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "4171f3b302923b876647c163b47c3c483f2a857a580c177a76b1f47b3b52637b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227712, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "b5207e92e9a1e5e0fe037c68db2ecb4caebada86b86ff1dac4bb866428a05a76", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 256, "51884343d34e4c61eca091b5123c0de0917f81874eb8d0db13931d1d8087fabd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213344, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "79f9b6249acc52d369ce8f94bbc898016c9d873cdb6e7114a697d9d8219cad5f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214400, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "05a676564db4aa122cb1c7a1f8b94c2c98f3fac30a875284f55a35f9371dd5d8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "e21493685db6dd8b6881ae999862ec9f85f7afd2d683e137cec8d49232a27e7e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227712, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "678481fd26f6f238c35066981efe1236c4fdd0a5e60011c19be55c1ee30fcd7c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "6704ff30cc0f4ffd624a4c1198c93a63ab7b4d7ca24d2e31fadb97bab2923a75", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedS_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 256, "d6012959c9e8ef4b7c270193871e7bfd772641a6b0d58f331d0871052d454f10", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213344, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "1ba67e6f0e91276ea1be60966290c27d15977d4e387fd4bf6a8b0c00e10e0317", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214400, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "f267eb5c0fdf3667f97f14b3a11f4b049f4141a942cabae4a3ea9ef14b1dcded", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedS_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "92f26de879dcee27b5917af33984e08e5b43b2df8d40e6e5a6c645bb7535edcf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedS_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "d9aa19636d052164651f823b073bc4c3771bb28e58750c2fdc6b19769c03ded8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227712, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "1e3b0091a7e21b46ce8f4a708da582d091222962333b7a9c74979f050eb17339", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213344, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "9105cfeecda900fa84cee5cfb9034feabbdeb5994e7f74e385a21360dd3ec3db", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 256, "98290c270e1dee211201df34917bfdb51aef0867df9ac3abc9a920703e4a69c9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214400, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "045b6eef498d6bdad090e0f7a9ba9e37b849c0571e56feae2a59bedf81af9fcd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "ab3014792fb865f4c57a050addf0f5770b25dd3bbd420c73a97db59fe0c6e7eb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "fd4d12f775991b340cd3fcd3a1d8d70319211873b9cc0bd1d65296e44fcdf585", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227712, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "b9c272588c53021cb2379267e29e0b1bdea7d1fb0da88ca1fe684ff863906fac", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedS_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 256, "0ff119d0e64e777546478f9418ecd592fe5ea56975a44ad52c427e51010b59df", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213344, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "5bfd9c5bd562a2f4ef6be401c1f2fd7d0fcfbf9968c9a6f8fed334e14e2e0e9c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214400, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "5a9d2a8ecafee66cb88600cfb29bc2c6f7f36464e9ac9c3b7702887bb82cef4d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedS_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "617d99f6b55f20aa38ae0d76fbaff8f567d08a65780baa541c261f0a8dd1282e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227712, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "47b00b124fdf43b87524b0b3d0c05cc83049d133b494f3789d3ef99865bac958", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedS_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "80f6740f6330f91a10f5ef27f2f90a69859b89afb05a757695bfffab5f791e74", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 213344, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_dynBatch_sm100f", 384, "7bbfe565ee07f14ac26a934e47d6ad0882f8d50c5bc886ab72f0c4aa507659db", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 218472, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "98144f03ca0ba9434f98647511c00da28873c75cbadcd097333ed560dad33d11", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 211400, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "e00dcdb7714699601d6cd95a6679b25128fa5df61c83dc6aeebca87d63e02a76", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221576, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 640, "1fdf8b2bb350278e63d25150ceb92e1a20ee9085ecf793722d93e4f983c7ec26", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 8
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 212392, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "d91c1dc25611dc36187eea70681ede294e5d6c91e80a9053a5f3be20a39c7247", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 192
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 192
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 218472, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "4f2c23334be5b33d329c1fcc1775e8befa40afce03ba5107c5896532ce2eb66b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 211400, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "ea586370a3092eadf8b608b1dbf8dc72031dcf3965f62c9f752e4b594dcabc45", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 212392, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "59ff40bb0ae88a50bafc38f25d6fa01695e10ae00bc012a0d33ee0940c82bdf8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 192
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 192
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221576, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 640, "5d83a5f35a910c0a166105a054913b692f9a158793808f243e309a8cc2af5758", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 8
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 218472, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "b55a9730ad00b9a8ddd16698b046fc0ef27a09e2dbaa6f5fce416ff01d18a960", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 211400, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "80e9b67d172a953ca308dc259500f6360f76490eb7c7bf61224ad52f2f77ead5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 218472, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "66309bad60622f48e7ff25e36ced0560f0d30d7b98aead0dc87a76fdce8d88f6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221576, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 640, "39b8e507550fe126208c6b84e5e05ae0744636fbf42f204d7b505131085a1739", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 8
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 212392, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "0fe40e022706bcdc10df8e054f85353d1bbe8ff2026236c08bca42f2cfc24482", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 192
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 192
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 211400, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "b047664247ab8c9cf1e45e4d41a15fc10b7359c08424022b09fc697d811c38e2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 218472, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_lbW8_dynBatch_sm100f", 512, "88f445f5df3d9a3161279ebbcd06f043bf012b845c4524fd1f244e1d41b56a94", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221576, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 640, "6db6961c3aa9b4e5abe89ed2729007cb6ff73ba0ef068e944f92be62aec021ea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 8
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 212392, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "e3c9f5175688496fa2fc6b1ee4884d54bb9326d672cf397e1b45f724db57e3ae", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 192
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 192
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 211400, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_lbW8_dynBatch_sm100f", 512, "e346653ec372a8c94a8e5f35472cec405437411e607ac780631dc37e293023b3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 218472, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_lbW8_dynBatch_sm100f", 512, "535c2135d933200e21fe86dba61b3d7609c165fc6d16a697e16ce861fb7d5783", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221576, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_relu2_eW8_bN_tma_tmaOpt_clmp_lbW8_dynBatch_sm100f", 640, "1fad67eb309dfbc1bcd5785659a7c1f29e22d68ddd3476968ca5aa9dc47b7f11", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 8
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 212392, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_lbW8_dynBatch_sm100f", 512, "72980ef8a14a66ff7f93de094d0c4ebf655f2be06cc7b5fbf0950b1515e80b13", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 192
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 192
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 211400, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_lbW8_dynBatch_sm100f", 512, "d65b39dc9821227de10ea2fb779cc6ea774032a081d011bdf3df6941bc3e4e28", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 212392, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_lbW8_dynBatch_sm100f", 512, "fd42650cfdee0b1bd4aa0b1ba64065c4d108c6370471669faa5224cf922df07c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 192
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 192
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 218472, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_lbW8_dynBatch_sm100f", 512, "f480167980b485f24c728bec821306581c42a8c8549291e8be6c8fb0f8da14e4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221576, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_relu2_eW8_bN_tma_tmaOpt_clmp_lbW8_dynBatch_sm100f", 640, "f753154e4312dc0c2e48655fa7cecd048b8f364eba7f43bff6cac024cf188498", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 8
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 211400, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_lbW8_dynBatch_sm100f", 512, "1753281151ce8fa203dcdbddf625d86740f360239abdef3bea50608eba097c5d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 218472, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_lbW8_dynBatch_sm100f", 512, "d71669ff01f9e785cf129fef3c094101993570500a94ec17c03481fde6314f7c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221576, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_relu2_eW8_bN_tma_tmaOpt_clmp_lbW8_dynBatch_sm100f", 640, "b11e8811024b2bb909b56bd8312a327090d3b7ef883579e79d9252ffbf1b8645", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 8
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 212392, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_lbW8_dynBatch_sm100f", 512, "fad042296ac84c201d6662aa14aa2b6ff94acd3ce75d959458a517f1bb769efb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 192
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 192
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 211400, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_lbW8_dynBatch_sm100f", 512, "122672a7dfb0f91102f598d041aead6fcf4e51e954a55910c7acd4390b2ab0f4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "3bbeb1e910134b0f8ff7593b920373bca43386acf73e44243a2fc5883e9f7e50", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221576, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_relu2_eW8_bN_tma_tmaOpt_clmp_lbW8_dynBatch_sm100f", 640, "13306377e2504bf96ce4f2370ae50bcbaa6a8271cec3a556d0401fbd92c08944", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 8
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 212392, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_BN_transOut_tokSfB_schedP2x1x2x3_relu2_bN_tma_tmaOpt_clmp_lbW8_dynBatch_sm100f", 512, "6830ca018c5c778cdc890b7ca3db1e8fc02bedbc3b3b842471c52a8ec667afe8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(2)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 192
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 56
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 192
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(2)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215424, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "965d519bb73cd6b0bb582c078a95c6f377e86d0f97bc6eaacf6fdc41f53c58f7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "40eb2c1820877fe86d8e3fb5985a41a63cf942927a7cfae81618ad5549469c55", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229760, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "78d62419b5d4c7d1bf8405d1830b527ed134a4c80df568b6847f02910682c427", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "66073ca1ada9f9e9997cb38e935eadde9a58b51547055b22f21253ab3ffbb834", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "6f208ce1d5578174794d7858e34ebdf59f2111e6fb9b57455b9fd4fa8c1f724c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "004af67108f039ad259759c5334066b10a5ea92ab41f07da13ed3bac6abe9c72", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215424, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "5e373df3a8c5d61837c0e09a2387a28fd2be86d2f3aee6b786cf1992eb230fda", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "05bbdb41869a55fa76f198c2bbeeb7f84d3f891238a2311f279b16683c2d7d7b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229760, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "4b074305a70a6c3c532ce7be889e2e28cfe1e280f62cf8e974b019bb1f9e6d6b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "1d07c021c4474d12d4a44fdc0f393cbc6046b01a71308d720a91f24f7a0aab88", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "a82fc52bbc1e092797516e067c7526055f96cace981592c90798c88d5d931384", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "1d8191711b483469afd24c5db23b5d85588c849643ee0adbaa08531fb0c8e147", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215424, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "ff3f687850f75179f0c182a934710aafa88981345bbcff59401ba531a39f3eef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "e6f6f12fcdcd206ca19c301e606ce59ee12cc28689b0c32800fd3638361d41d6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "a90897b97e14377133e7f8bc773252cffee655a236b5959b0f4bf140b901dd0e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229760, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "1f2d52caf11fec2b5ec5e5b824b2a693e30cb15ecd830dd19307034f490052b9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "a82b724cf04014bf34c57be967193863a19e284cb970bf4e25984f5e74c65872", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "ded889229c6ea7e7ea9ee9ed9f9ad49704a6173d0d1cf369c4c65504eb50d49e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "a576f099252c6d9ca56553f7257f8b530bf3efb2de67db076d9a6bb4e3689c18", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215424, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "116ec5905dbd4749db604b5d3289db8ec5a5eddfe8e2f90670b902a5ea7a1d46", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "5a48a16ca59246b7b10c188872e3ea0879ff6d07952790b6be0cf733537f19cb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229760, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "32c06d0a2844e97477946e20bbd48767afb3b1bf626029f2ba62b58269796edb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "bdcd3992e844cbb8d61029e6ae454a6a7b16b6d60fe8e7cdf63f9a0dafa0a43d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 184776, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "0e1457cc0defb933acda26a80821053aca17452d093e563822a4ada72e51122c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 185768, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "bbdafeecb9f96c439a43517846809a890b95f18f9c8d8c676c3839109628977e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215464, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "c4f387421da8c10922973f328bb0c2e06a55850bc846a2cef01e2e078ba89adb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 192
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 192
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 184776, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "772ae64fa0bc6ec1a6c2cb5c39df511940f9fb496f4d44ee8bc162f4ec3b332b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 220552, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "2a372502a2a1e4eec1a71f44dcbacb6fa16d63a36c8014120ca8203f8a75cbc2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 8
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 185768, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "e054b8c18ad66cb9bebc0776f0d81ffba55c3518bc6c5b22dcc80689f4470e96", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215464, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "d0ef947640d6de1f02b3d0f9ca75fea8db83c80865d21e8e359a1b5479e2dc92", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 192
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 192
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 220552, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "740e758ba22b79f0bd57baa7a01ee71edac65236324d0ae7150d2637c6749f93", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 8
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 184776, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "40e2830820a0427391fc8ed67d288473c7a9e39ffe8eb889eddaec9174b5a9c2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 185768, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "a7669dad2bfcb2baa697411a74f6cf0b23de89ee0abf0fd664f5a06d9b9a8fc6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215464, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "5d0e4589729b13db4382bce2bc1b57e895fff9a6c3aa03d38f1cb06625056c0c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 192
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 192
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 184776, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "da9c8a0a0ebc5d5ff23415343fac5c476482796ce0912db90a7ff2f7528cb5c6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 220552, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "4c07876c49278c35dc2fba4763713c81cd8a947990f9604dd76dd0212a263fda", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 8
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 185768, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "ec2737b617f069950e710dde1244d56a598a2d8ebffd1fb432026054e01ba747", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215464, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "2e8977371631a16efd283cb6a0d7b3b1f6e50e673d6dfa6cedd6c6bd62600a4e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 192
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 192
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 220552, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "fae1fc549a8d10203e1be9d505bec3c5431975c84c8d5e4f981003b7440ca096", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 8
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 128
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221408, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "10beb8f6b8d2c7531502988320ace49b43835c7393a21924d75ad5e4957580bf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221568, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "021ebc8d6ee8774be6e8a67e54a3d16355a550a03a26854190597ea444ace8c5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 185696, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "2923214817e855de999d3647c8569359b59764ebf96b96239be209d65eda397f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 196960, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "2960085dcfa0262dee8861ecd9758616bfa65d9fb5e358ee067ca310064c6289", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 185856, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "7e6e1c2d4420df3f548b970f4ab9239d5c30b8e98933315cd39476b87e2ab5d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 197120, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "d19e107055b8ce0a6514a61f96373faf346f2d7e44866bbc616f1f703fdcacfc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 219488, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "399f229b44e887d96c8fba803a57fd93d8c7dd8b3f7fa503e27bf4b3aa810c0a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 219648, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "dfc717588ff92998929479d623ae628e6db5f294671792363c69cfb0b1ba7775", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214304, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "22646a90e0be6936f2e0ad73a372faa40070ad24d1e857be88d96960da81f9cf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221408, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "8b8e7852a0539885a0b5a08a3e7178bc2918545830935c203bf669fb3ec69be5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 185696, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "8ad3f83845e85764e88beac550442ed295c023fe956b7bbcb7d3e71bd157600c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221568, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "9d2ee11c0139edf36909ce9395b4dd8e8c0f17467f1e469c5b204f499cb4ed8e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214464, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "6d6d196de27952b3bcaf3fcfb6859b55df8ef729e57596aa606066faa04e0314", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 185856, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "22a83400cf6438f0000ab6caa49116e86dd3a5fa9884998d8349531a50e69151", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 196960, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "7cc026fd7123491d5240606a5f4c404a0886387aaccc5198ac5c2e37911b4508", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 219488, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "0fc6402b4cbffe9d54d90f1239e5f51769d8be49cdf316b6d4ea20507aa40a64", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 197120, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "c3b237104bf0b6abbcbcf7932ea3faee37a8c5bbbfe763b3028134377180b6e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 219648, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "f0a9f5c7a4c5a4583b900af19c27c77f3c227f7f3c9577ffcf9ac19d83f44fa3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222432, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "ad0decb152063e7b64745f1eac0cdbb3ca62580fc20d269a0771508e125c80f0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214304, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "c85f2ced439f227982fd6c607b73ec77cafad06ef388ad117e676a4341c332cc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 214464, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "a3b5368e5e225a6225b6c4368c2a21697923b7862b0ee79cdef27c962b349c90", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 186720, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "bb1853425a3878ddeae30537f8426d89ad343ee49add0f1a5e821f20aa52d094", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222592, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "51cce492165f2e315b79a5d2d9e1b300054ed1aa815a0aa2473f3cb4306c610b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 199008, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "936cda67cf04e5ce0b8558a0aa512ebab07d50dc9c251421beebb6db1b915191", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 186880, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "cba6f5ae1826b5054231a60b49497fe617327726210d4c09cd187fb36e929198", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 199168, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "3c1b34219efe944641303ebaf376d833a93c419d8e2587dd80657174a1be793b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 223584, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "e2988a877d1ba181a0f825f2af0cdf6bda333f75dd863eff789834ba827dce71", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 223744, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "9a2042039e1b60160c6a1571efb3eab128fbb8c31cdc8e6b044a11d2429661df", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222496, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "e1a73586078bad37c732ec4c638077cc964ef04b6dc7a8bc62a3e594e54fe1fc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222432, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "95303a03e09def72feb4bda07ded9385794aa3458017ea2b4f2301fa351d772e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222656, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "565b09e0149a3e56dda2ca6689d9200e8663abe8106bf90b5e6aaeb357ec5b22", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 186720, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "bda0d9ff286b903890e00342b7f104d22238ae01d493632324caea799cfd8011", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222592, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "20c386d451dd98210e4fc35814be10e385d3cb4a946a25f6a0263182cb2c356a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 199008, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "33af18028aa4f86fc38609b12122c33dae0b7cf58e66a26126a54e2cd51f3868", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 186880, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "7ccb4a4dbdc9e3374ef5c2928f422a058d2f751cff18f988baebc32edaa1d1d4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 199168, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "fa0c1e752f7a0d4823f7a595a6f59a27446d3fdc6b7db4c1b4fe836994e082da", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 223584, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "49f38714fcaa0db8921d2e6a475439d9479a4daf8eebd9ce6d0abd319175af89", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 223744, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "2b48514d5cc282fd5589eeea5c54cc26b95a337dd326b6a73dcd0a32ec15468d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222496, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "ccf467314ecc8712a92d249f349d6b33bc60bc15085253c691a2d7169d618d54", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221504, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "02847f6f805a55da36ae25a37b02172af213261aab5b13ac3f75ebfec1532b24", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222656, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "0b4b1cea155ce6b6ed1e6adc62a97eccaf53a96afeb34a466a0e00207fdc240b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 220488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "2a7e864b771dfe376c2dea56cb64a1630e98db5e97167f414973a91ec38562c8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 2048
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221664, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "e6ca8724c14aa688209f7670f7e15311426a11831aefdbf11d8a37c4844483cc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 113984, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "7b7a2939abc0ca44a50639c43b69f1ab63daa8fc86d0a44e373387d069e2460b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 228840, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "c30d75bd3df2d4c26b3929b6bb9ab68ec242bf68bf81c7b16a8474ffe852c93c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 2048
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 114144, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "ec9f7f48c12af0832bedc8a3615d8f9ecc2099a4b2805762defd66bc3f179529", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 149920, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "be571b3f37f30431538b371417d6a63199795063201caa3d751e0c99d3549111", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 185856, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "c9d28e22e93ba42d018d812eb4cfd153d067662cad8d5738c25c56b015801600", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 150080, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "716e3ba3b05f2c404298b5f96961de4afdcbbe8736b0a1cac63b5e0dca301785", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "4e282211fe4068e0072dd04e81c98abdf48beb7c99a8f8b406ac004b16834e9b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 186016, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "31829250ca2fff841c927e21e0515c2ce96514da225eb5981b605e1ebbb9dc54", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221952, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "b41aa27ec646ac039e962ad487b2d6c58f3c6112e1c9725be6bc3c09eda45070", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121152, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "facf791b9a75af755092927e30017e42aa6d5ad48c6bf9895757c6d4a5274ef5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 159136, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "71c71a224614d75c7c363b93cc50759c3dfeec676077e2b07f5812b5903ebdf0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121312, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "1ef38834cf71197c44ca2987c98112fa23e386be47efc057469b92cca9141c80", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 197120, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "9e53ffcdb653d7dfa29078e9dbf3d3af5355b214158af4ba0c63c6f8b8bbabfc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 159296, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "31a21302cf39ebc693a9c100b87852fa56b701f277c70ee15ffb2a6f26751286", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 197280, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "8849670e589ca7ea449ac0858d6e416d9bef2837617c5d3a8ab55c8b5f935bdd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 115016, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s3_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "9013ca54e676ccce71617c3822e547cf78c5f7e554878d3fb672330a8d36b641", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 115176, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s3_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "0b0b29a5ade031d75f263f05493ad6adcf317dffa5a20f7406a078691ec3b2a4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 150952, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s4_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "e5ee7420ea252f0fec99ddcc08d572962cf40b61636c1bb2d98606f0debb31b6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 186888, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "fa09cd2bbd40c342752bc9841cb81dc74d49c0106b4429c6595724c541ad8603", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 151112, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s4_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "e9f570bd7891109fcb355c96d1aab2f87b73212234e817541d8832f9aee63997", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "bdc7ec0782a3a34b55b515b60ef13e05f2d0dc3698dcff2d748d1c45fd490f2b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 187048, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s5_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "835daa28db609b05bdbfcad7681a16c4f868a5ab5a62886d3dcf8e5d104d0832", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 135488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "ee8c32c256f4316686df9def4eea0fd2e06880b98e04e7bfc1729869a3762935", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222984, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f4c43299ee9370fe93b09f93c536cc5aff7239d5201e8e54ff91e1843f5fc48f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 177568, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "85643d18688c27f691512b267474669ee43bf244d1d997ea8da98805eb59241b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 135648, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "078ab46ba7b36240f274401cdc45ce21aaf51bdff8ab8d1e3eb14a4e9b5dea2f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "27819e44dcd0b14ccca7d2c7e029efedd090ce752d56c3ce3b20a44174865484", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 177728, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "104c131fc1468d5212b754d02567c63c7e4f151b4d5804621c58471162542c8d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 220832, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "e08d5b3d08c79f57f0136ab073cb1ba99e71384f19dbe0735088be1e318f2123", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 123208, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s3_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "49790ce11bafded7978be96d412cf8668682450785c9c497b967f59b75c93644", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 161192, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s4_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "1c45fb583868800ef2d4a76dc7c3e63003141996e6f8a0360fd2ca9061ae1df7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 123368, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s3_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "567d0948ead482b457cd23685bc048e9c39cfdf24f1c2be34eb0e18e5d42188e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "3db7dd385e8ee9e2d1b824908818ad3e1bc7b5971ca8b11f50794d1e8c3bf2c6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 161352, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s4_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "c12fcdf8c72f88ddb59b0cef2794e9624e55c4a01cdae987e09f4bafa0c3cf16", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 112968, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "0ece9ade2b7e139f163442f140c8a8a62ec5db94bb76016b10d748c1dc13e74f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 200360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "beab73418dc694e78d9bb2f22c1db0c437e815377a4359658fdc18f50fd36572", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 148904, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "4dd1101b33ee70fe19a49250a45de0d3fc9453e390a37d52e177386374f425c1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121320, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "97abcfe4ee68b9537fef8dc16b15f0940db7c26879f8d486d00128273bd25616", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 184840, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "40ebe1bcf34fe570bd940cef215ec95d6852e70f48e64b2238f2544241f4c97d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 157256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "a7c382b458084fb98b73adf5e34b9e72fd93012b18b94c71f773ed19bc9a1ed3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "d7025957f0349b987cd5a6fa8fd357b635a9d136da699e58e7e143536f075078", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 193192, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "ee10dfc0887a6c5f09aba60eb7a80f475f23a575b2d39f087850bd98c252076b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221504, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "0ae5ce3da6cb4071cfeb576efaada4eb7193fc7cc3f1fe76fa150b163dcbdf61", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229128, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "1d9cbd9c28e35a6b93eb8874908d822f2d8d49f5d7306aa2c50cbafb0795e1be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221664, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "da43e1113a2053c4f73019b61184d740c4d797e31ead7f641ed329f182e871fd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 220488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "3775294ccd66b7b7711b4cf8b6702a68213f010987459c32cb186f9706f3e3bd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 113984, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "df664c91f9da68f1cfdcffc68e7fd6075c598d7c9128ce368a4ec445a80447d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 228840, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "cb1e9a5d874331298acb768f65701d9da7a290f76130c939194fa0f2abf5f5cc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 149920, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f5a8472022d7096edc9304e348f373c3a31477d4c2894f2701e35dee184d0871", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 114144, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "ffbdab1ad026dbb43f2eae0ce9f94d27c2af4f1ea4995a507aac2854a5746c07", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 185856, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "6c81101eca331f2a3fc301b9677667dccd7244ec63a523c7541e757cdf7d7717", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 150080, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "8f277e9fbf533712dc61068bd9e0ef2b3597e586ca31f13181797695f1e9323a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "72fdee928d0d6725f2fe0354e6640b9a5ff8e6b05bbc7c46d617fbf4ec926f5b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 186016, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "0635b09ce1dc68047733fe62abdcfc3307e692223080db7ae8686a1b1d97a055", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 221952, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "82561e7add668ce01813088601c39732d97539eab83693fa79b95190f43f136a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121152, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "94a82947683c2b248e7c5df9e9a32ff031d8b1d7ba10812168fcc995753f1df0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 159136, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "638037609cb129757a66145b9839c46c5a9f836a8716a40a4a2f78a4c8faed37", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121312, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "3f2e06195d804fef8d06fd2ee6fe08976492926fccc6a267f7e1ea595adc4c21", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 197120, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "7bbbec6195fa7a4550edfbc13bb4467eff32f116c672cedacbb63c2f45a638cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 159296, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "103a828e48d3939fa2a07ad7fceac328c65d91d7300d3619703769059afc4158", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 197280, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "691df04c8929c636a6a9301cfa1be4629e7130a7c3166440f156a76426a08a10", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 115016, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s3_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "d031d13e87db89f03b75b0f4990e9a9b198f8a4dad66d5b1dd466c294864a68d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 115176, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s3_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "c004eb9429b65d410619d64c638ba92eb52d1c46e47c38e50280eeb5dd4f6640", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 150952, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s4_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "ad98298579b04adc43d6587b404dc4abcd422cce66508909d665c8f958439e9e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 186888, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "a8c57bce5d8ca618b25b799f730bd68d8112a4cfcebf44f6ee73b29ec406613c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 151112, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s4_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "17b887b2a5a2fdd1cd4f875227c130dc3c8451036afbd681b4cd07a00a84a908", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "5c117b6af94178c9947b60c907400e75154057fc45035bd392246da2cdbf071b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 187048, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s5_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "7e6a24414a828951023e055b615d3cf9fb82c8242bb4f3daaaac8fa95a45afaf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 135488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "419827cc528a66e34458700aa5bfc4829830c3b7ea16344967b25e98a2882cba", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222984, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "d56a5664abe7460c71fcf5192eca6f6298aacf317b91741f086920728dc3b6ff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 177568, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "ea2bd809a5f015a28d63341b9683536d0beaf4896b724adea6b02d9f4434d244", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 135648, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "eb6ff55c20cf47d45f7cb7f5e6492b8fe290b6e42877cb28e8f64325f0f15fe5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "a6cbb57f89cf2e573c2d07b3756082aa43e241ea7535252b5a9ca71a48eea4ac", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 177728, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "0b302201fac879464ba5b66ad433d0c7129ba5481a9f2b8e1ba59fcad17d221c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 123208, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s3_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "8888fc59afb90d1aecfd01708aba7b675be51795401490883d7929c0ed0de11e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 220832, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "9f4a234a4fa23ca95bcfc26cf55f8a784775918a2a52bec1a41b77002f6865d6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 161192, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s4_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "5b1690d2cca1be2017d55127c9b4cbe0b1f112d6966a45e06bbb326251b89665", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 123368, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s3_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "81b4b246c6ccb518becc19d3963dcf2933f68f7035d54655f79b7b70eb583fe8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 161352, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s4_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "046f721f365723357ea5ae7e53db279ba1168443ad4485fdc38441a1b4d5653c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "66feb12020011bb4914ca1a0945cd86913428845a6f3552cd1c1e7fcbcfc6984", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 112968, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "0c0be6b7e1c2e0c58e41aefa838a2c2f3ffbc5dbcff7a6ea46e422e8c5a5f438", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 200360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "ee2aad30ef21ed52be9f9abffee2cbd728c092658a6f6446af876cd56a92521b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 148904, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "57ce968253c431c4d6636cad1780d5320e26cf1adf6f8f05fab221601a607938", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121320, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "382cda3ba4a508c5fcec852df7671deab201dc9838a3aed9240470d9ce3fdd7a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 184840, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "b7676fb7c13d3b53fff73b0a0e5cce6100f79e076fa11338cb76224ec65f66e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 157256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "5ed0eccb20ae2faaae81684eb62b2d5b9b80279fed1534b4b048c2b221bc44d7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "07b341d517ada74ac891863b410c4c79ddf67125b69cfee2550089c8f0dda62c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 193192, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "7394a594565c6bbbf78a1e633274b20b7c36f36f3bff30b9bd6ac0a6e17c7112", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229128, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "2873a923f19cd02d2b25e24bdd33451df81230b2aa68e37546b67f7c5d4ada19", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 183880, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "f728800474ac95eb3c83f97dbd36337746102f40169782adb2a6fd52b71dd302", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 48
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 48
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 1
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 1
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163688, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "0650d096d964e79b42719f9dd104869b016e8eb856ffd2901ccd8b875d8ce24f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 48
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 48
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 1
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 1
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215560, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "46aaa431ba8c9671bf1e20d46833f0e8d4263890a9b443cf8e34234d0c9297a8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 48
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 48
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 4
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 183880, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "839d4f1ccf6e619e461c20dc7d3437faefa34f3fd835e23a94351bffd0c07a8e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 48
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 48
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 1
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 1
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 191224, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "9363acb42b15a6e2d71c7738e6427c95c9752149767d067a3c3c91944d6543bd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 48
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 48
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 4
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 163688, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "e06f3e2c951c09eaae5bc0fe517bac2d295645f71aca6a81bea7b1c8463a6742", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 48
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 48
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 1
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 1
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215560, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "e0dae5e4a8c6e6e7f321e8ac42e727d2d11d627c011a92128772991743bb7fa7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 48
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 48
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 4
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 191224, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "cf41ea69a69ceec661c51722a4f600cc08a068f7d4a13e1b746e1ab2f4276520", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 152
, /* mNumRegsPerThreadNonEpilogueWarp */ 48
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 48
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 48
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 4
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222872, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 640, "bd468e17a268ac60e22aa98c709c882755522e9c135ab5b4ef9cf386d1be532b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 1
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 144
, /* mNumRegsPerThreadNonEpilogueWarp */ 88
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 56
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 56
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 4
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222528, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "8acfda81f1f143c253930a3a08273e781b08aad417eb5b8eca366ac34025b36e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222688, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "d598e328accd7ccf07872c826b7807ca7d74209685ae6461d9ed4b492d921777", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 220488, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6202e6607eaf4aed5053712bda28b97b391485084375d8e338ec91337b3e0133", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 2048
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 228840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "0a2a2ef7b3edc126e592719fd878b3d3d45f5051b8e9a30c27d4f0af4d81ae49", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 2048
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 115008, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "3ca716abf18848ce7a0828d8be32ec4e395454bfe9feb3c1476c5679c78b035a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222632, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_bA32_bB32_bC32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 640, "0cd2227536681f9df24b0e96b49a343e8839af5e27c966fe63b357c345c6c75f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 1
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 144
, /* mNumRegsPerThreadNonEpilogueWarp */ 88
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ 32
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 56
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 56
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 8
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 4
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 115168, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a6dd63534169356106c9c6f5c95266cd8dee5e05f7555a3f315ea7eaf88fb4df", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 150944, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "bd93aeee74d680100d76cbb2fbd0cf38c3c07e4c43cff37eed5b3b421513ba42", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 151104, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "efb63bb8456c4ee75873ed66fb459a4e8859df5ca56240800e9ec337652cadd5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 186880, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "798802cb7ec62e69f2f3c033dab4563f76e9e384676deff560af3a55d03d9a8f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 187040, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "8e94efcf81d11f563283986a6a661bd84658a70de871b14e00a6aced7d69c78d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222816, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a55068b24908bc5e2d80ab89cb0baefd689034dd59afbc7e6304aa480660383e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 123200, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "33c338e3fc2d83269b15480863671947f09f1f2799a61c1bf74386e96aad28b3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222976, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "d15b42bc8c0d62ced51c503be6aa28366f75b6f2af15e4e9c8555fa89ea5df26", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 123360, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "21da0e0a94fb916dcfe843898d3d77748b443fb46d5f850f14c6172d5eeada5d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 161184, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "fb58aaa672f3f5493ae9355c4f80bf91f9664247769ffde58535b56b20e83f53", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 199168, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "9dd1cb08e68bdc681e7af835795f26656533c988487e23022a5ea339a813182f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 161344, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "1483a4ccb769017301d8f78823c9d351d55ca240d9dd404d7c53a501b500b67f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 199328, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "4ee47e3d2e172e22c7e2cd5e09f0284b796e043a7375f22b28901ddc1b332c0c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 117064, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s3_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "8b26a340dfe02e5f1acf05b230a22665efa949116b0ce0f93468417b80eb0a19", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 153000, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s4_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "51d12db72f034cf2d839644718aa84bee8e0ddfa377ee9d0caaca4d516216d9f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 117224, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s3_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "8875d72f544a862b1694a8d6dea6731aa77a9befd45318e8d9e3a34862cb992c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 153160, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s4_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "9a1bccbb1a92fc608085c165406834dd3ebdbe79e7b5d77e52abb535f818bf15", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 188936, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "7e51627b60d187bf87db31764afd857bfd00a356c18a47a2e1bc34a50bdfe583", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 224872, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "5d9a8ef8c7955847c0e1be42edb662c758e29c5aadf89b99ebc45ee4667857fa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 189096, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s5_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "f72a76c6954c4ff84806cd85138a49d506aa48881fa41f69ef4b0603f7c28611", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 225032, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "f084c90bc93a508846c690433ebf021c7fe4132f568917c7212df6276eb78a8a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 139584, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "b201d29f441591fda35d36cb2f57940f7e84e7f39fa7d9c7fb072d3bfe18b631", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 181664, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "1f83f3f2558139a13adb8594f3b22464e8670369a3b62fba2a542f9d6372c5fe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 139744, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "fef94093e76f52c122a1bfc59fa5383ea5e8e41377c0a15e8cae11be3f4f4c31", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 181824, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "e039babb0d4d7711e0a3057fd8f4b31abcfd2b3759e2cf70ae2b65013e727e87", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 224768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "4627fce5c1a7193f4a15cfdd2bb100a2379d66610be743c9694131c28b219026", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 127304, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s3_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6c15f3d29d5352f4ab912626dd9d428c38ff6041f7c0657a016327eb7f98a1a8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 224928, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "4683c52aa7ac6dbafb7952bf27214ca5af4a59d595337118063ef5c625fe2d58", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 127464, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s3_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a3598a7c09f930d06a996cb9ecab2c16cf0d43284f4addc898d6b7b926081c23", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 165288, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s4_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "137e06af737a3c7b889d222e50795139cd17e54b7e7362f7f9d4a174713b4f18", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 204296, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "66c2fd2d89f0190357d8504c18a65d1e682152a99dedbc867ea37fbad850cd17", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 165448, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s4_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "02ba67dc2fdf5a56566b939628d3615145c198c2c7122cc56708259c8f4c765b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 204456, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "63532d00d85cb0461a2eaeb22dbdb1c97bb85e6d3286cdaada2f47b18e2f2406", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 112968, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "e51e7cfaa0e56a323d11eff2f2c5f33ddaaaab7d761374d2e4cf9659c9c80568", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 148904, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "10b021ca8b63f8d916bdfe89c90f67c1e4e1c17938a591c7b4bb4d8ab21a196d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121320, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "16cc80be4637bdd1c5b61284cc67899586ba45e544cc43ac33414f3ff257c0cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 184840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "33ea319faba8cf67dcdac915c483fb8e7f4101568a273e1d644e5bb3ec3e3c71", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 157256, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "3e9f09897c4a8be52c3a73d5c9eeecb1ed7e665ad146db6c13915912989f5ea7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 220776, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "72ce3e301d0a85d21cb1c1b1af95be9bebb3f20fef80b5750cbf3350b48355ff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 193192, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "2bf4786360f048abf8c65f746927435f938018513374f1af0a75b083e845d9e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229128, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "1d615309f7106cce68921d520accd625e2716fd4d02585968f906869ce394d51", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222528, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6abac858c18ecb9d565aec2ef83654d7c07975db86987183b51d507cabee413b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222688, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a7b5aad2b969cfa72a9466c90ce6d80a0ab0ad284897e860a8cc84676ae0e85e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 220488, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "8c97f5dc873263b1b26229922128641dec29739a7a44b6d301af1cbec2b04b63", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 115008, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "51751e6cb479bac291e0e1dd809ec34f7e31082b2770f0ac70d064b799893874", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 228840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "12eae24e41edc31ca5d6e4fa2efa02b9919ac36c078432cf3e6a3cf6255ff58b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 1024
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 1024
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 150944, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "be62af719e47084e44329f8604e7441b8a4bd37a024984438f718cfeaa6c4fa2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 115168, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a6b3a129cbc8ffc476c5f6a49a41e73a9598864d74dca22c2f90ecc64caadd5f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 151104, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "8259060612a3128f3d20716634e6dfb816dc1d19e3780e142659a8b22c5e4957", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 186880, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "b862630f86bc03f4d279d7f43f41ba07e168b5a510e85e5014878bce048de89e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222816, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "493edf0a08b84ae3404db569931d40e948664a6f3a0a7d3e1b23629d2bc9efc5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 187040, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "98c22710ebbb31313ee06a8799a061aa75d679a64a0da1da91cc7639913bc291", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 222976, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "5644235c60a7ddec6db740a4684e63d4fb9f81f2a73fb1a49fc35a96995f6006", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 123200, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "da2a46cbcd7ec9fb6b5f9f88d9f8aefe33fb1cec9dc65820fbb0796a6b82f23e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 161184, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a31db65787278a77c7bedd6dc4a56e7584c0e62c3d1a03c19951fdf1a4dca6cb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 123360, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "25ae723c3305478dcfe272af9fc8c0a16538f5b25b15256f63ce8feccc420195", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 161344, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "f91acc77f18ab43d2f7d011701e0d2e6f8ff0e9055f5bb98248d3742a40ce778", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 199168, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "c527fbf094877e3e21857f7e639ca5f87d3c319919f39d427eca975b50848cd7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 199328, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6c46430b4fcd9a3e78c2df8e907310adb9632c25b9eba747218b86e6e185142f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 117064, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s3_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "17cd0d20a4f9f46e35ea44c2a78ab577330f472e1d586e49d6e8b4ef87741b3f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 153000, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s4_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "b29cce66d5f85ba85653fab8fa7c59c5e62e424c20320239ff0de1fa72845d7a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 117224, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s3_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "55c867cea243913a4fd2ddb5923d0ffdb6db2e68ab6bab050caeece0be482e7c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 188936, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "4a8674a321ff474b1732bd663b12007c4ab90b6e01aeb2489c23537c3d8f07aa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 153160, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s4_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "dacae91e546b196f17dedbf8c764a667a0ae4f410dd00a0d48f4476b33f47e89", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 224872, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "be0e6431facb9e7836983057beab7ff3abbdad1a742d774d463107337ea3274f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 189096, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s5_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "47512f42b9aeae4cd6924e0b2b511508da48d3b5b2927e6a655dcab8607c3913", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 139584, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "c7a578f3e9c6d01e00fca7be16c0b9170d132a8597da085e144baf48cb3f1304", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 225032, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "652c66d5f823ee712880c604b61c084efbff7ef90e1e6f6727a56b9d26f01b5e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 181664, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "686ce31f6948da3d71736f2bd91011b9d2c7acc4ae0f910045116de322059985", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 139744, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "1ac106c7d8820d6aa9f3994e033c650dcdfcb634f3b98537fe0f7b1784e9957a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 224768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6ec9b2a9b7e95f5e4eac7c8f866ccbc8b6c0e8f56ccb5cfa9f1a9d3195dc5851", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 181824, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "7a13315592e6ed5c7cab26060c66f377813bf4799e92b957412cc05d4b4b58f3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 127304, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s3_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "57398ade6604d681ae161fdcfcfe03907b333692405511d1bc0caa416965fc3a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 224928, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "5f2f9c44225ca399614acb4f9cf7612a16b783750c3743d939f9dd4be1bc0e0e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 165288, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s4_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "22a3c58a240a8c1c612b74aeb36b41fa62cc24591c38fca3b75da8403320d591", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 127464, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s3_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "e512627bcc9ed095a9c297fb52fac000516e667433ca6dc8fccd03355bc62a2d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 204296, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "4fe51af5eb10621b751fe643edb8e04032c7025c477a3cf15a3a1638a7294e6c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 165448, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s4_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "3495462aa01060d40475a962f26c0a0a3ef49a02758b6f5f0f707897549b359e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 112968, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "19737be633c843970f4efaadbe727c561fd75b3b49df7b7c81b1d71417654ad8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 204456, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "b2391b0742417d63c4d23f8864eae2420fc3b5a794ad95f21af1ed12e3f359a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 148904, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "9a81500b5a3a44a98ed1509aed7d5c54ac65e1590599804a928f9a61b76f381c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121320, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "c8182140e5231d8bd102a76d3cac28813a268f6f8ead1874fdf1fff47dae6a67", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 184840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "b588184a630a18282cbbd007e7238696a1a93aaa9bce07b017cbbe1eff3af4d6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 157256, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "55defec13372dfe9bd2cc937bab652e9bfdfe364deb837fe124f4656063fbf82", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 220776, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "b4341dad8b1ab34557807c22c0ee88945803f75b4abc60e4685c9e1e067373fa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 193192, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "9849e5b88e107ad6e01212552c30491e854d490ccc172e888965321c87d8f859", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229128, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "61c1e8a765fcc0d26ced4c0596aa09d410ee0c645ac11ff0a82c281d225ac875", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 192072, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "1fe2edd7e2aa596bdc5aa80565566554a43f07b90c413d3bae186dacc9c967ad", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 171880, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "013a493cdf19e3adb879ddf4c71cf314d0227b969319e14eef66ad9b8a9e25f5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227848, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "076bac19ff257891850bb3296b290eb3e8626e11862c5458d8338880f0c97ecd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 203512, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "b71af69f688e023e6593365f2e9939b110f27a49907b3c61b6ff0d4774f55ef3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 192072, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "9515ea5c4a0f145426e1ebe5843f91cebe2551a41868ab6c87bfb4abef730bf0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 171880, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "21deb5e47b391d56c0edf2d0a0a7754999f3371f978f226ecf09068f635f5c9b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 227848, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "a35fe703d015c52dbcb23686b8a7f82473dedfdff071641bd065ff915f796649", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 203512, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "7b4afa29adf129e891a66810b6e8a3a9b8048fca356fe6c032cab9ed9265ced3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 197192, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_bA32_bB32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "b800831acb87c62174e174f023c2ace724f274675872657a416d6b92820f1b25", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 1
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 256
, /* mMmaN */ 256
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 144
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ 32
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 256
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "7b6c3b9793c27e8dd2dc41a39ec831c408b129cbc5ca097e1c1ec9f2133770a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 71040, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "5c5ecb763b411a0462b16ab76f2963fc7eca2d816d0b33f0e74813fdec22cc99", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f64532054e2a78c8443a9790192e5f83f920737ed116fc39e7d8b26c59951f66", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 85376, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "34ceb9305b53a3b74c42f4258053affbf357972c85dfed5bf7e22d3c683c053b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "d607dd0e2a5a17ccf8df8df6fd126727cce4472c6a3e072ce8c681734bcf5a01", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 114048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f93563eaaa11fda0d3c11efe6947172c4daff6e35e24b71b82dcf96f9e0ba671", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "7effd19635217e443336f4808d680d47840cb0e7e6b5074e23b61eca72574dfa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "2b1359e73257975b0b737c3751d186323937f38917283552a87cbc1b4a46aafa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172416, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "289ae2d66a328d71a447a4faa19ddedd24ed1f387cf0496aad2ca1001c521422", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 71040, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "01c89ccc1da32212f3710098ba0b64929dca1d8f9dbe2244dfc90f0e8ae42a52", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "c65bfe846d6e27fcc7d7602e37e464e086ec9406f57f911cfc00f30027ffbb4b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 85376, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "6f63512c02db76808ff7436a102f6cc85d90c70289c6b132f7f73d3d8a94f5a3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f805f365269cc1889494bddc2320ef6629416dbbbc9c8ee7c7b5f47d650d26db", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 114048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "ca543e08acde717b9793a5f406ebfed78777c80693e80162e16ca8beef9abc75", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "2249556b437f39d0c023efb6ec7f4fdeb45a62357dd09ae939e1c00958548cbd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "9e88c5d0c106bfcf1442b5fd7b9cc9aebd5912f01807cdaaf2eb7bb64d6d2dda", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172416, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "2f072b667a630501f21badff4a40af57b2e48b0bed11ac31f6a51ac89960432d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 71040, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "63f238da00fe49d6a7673c5e4203df209b0ac07cd23c5daf5fe03bc8e9816a65", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "f7be44a32448fc81a41a29b06fb6fce5b0a928d14e7ff8d1990d43ce992eab56", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 85376, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "99dd51b3771a3afe91042067f0d16ef9f25f5a54e01e1f829fd1c992c15f758c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "f9d528827e466825155c3a93ee5a7f0c907347355c6d8a9ac2b57691c2ad7beb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 114048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "0b1b009a089c85795f6008ae15f694fb30d172f35f59bcdf7b50fcd467af8fde", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "150e302a3f00b59bde605d49ec2d89f98fe17ec50ba395038933c544f45b0505", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "85344fa3d1af49b39da700403f7b77aa4e5f868ee350aef5399f1020e6e27268", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172416, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "4dec1c843a1d0812a38caec557733b28240d1fe14cf764649e2282495a295d81", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 71040, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "8f0ea81418203c749daa5f5d0bf4bc808cce4827cee73f0190258aeed0a2cc26", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "1becca00ea46808557875ef0b8c51cc05bcdda16021ed51e6e7df00367203e40", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 85376, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "71684c2e0c1a3ba48e1382b2ffd1aed9cde16250ff71185d63b38934ab164d05", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "ce3040040889942fcb53cfd0c59587a291ad6d4baa19f8b7d8c1ae0b8121ae23", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 114048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "ff78735ceb04eb23dce837162c7c3d2458ed19414a3f26f54f9a23ffe050e874", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "f0a41c2a71527664d08a31d871378c66d1b4b0bcf198d9ff8898c52394dd6e5f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215264, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128u2_s6_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "47eed4533b423d274b3d9025c3eabab159c14ede53484f6deb16d415e789490a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215424, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128u2_s6_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "f80eabb27d17931fb2cee4265ccbedb5a2f039e55842c0caf11e8e9bcc662c1a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 172416, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "e72b8b7756100524798950327dc7d39be62b2ffe1d827adeec5f3f140d753f3f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 180416, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "c188bd54e6e911dd44b7a71761dcce7b861f063ff060fb90fcb9c0fed4e4b084", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 145568, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128u2_s4_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "889dc76cf9dc4f98de46a32ef2e617d97ea8fd4d34e66bcf9cef158ed7390f47", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 180576, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "d5d77645c44033019ae95239462fedd16a2abfb4893a0c4da8df121bd987bfed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 145728, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128u2_s4_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "ae7d9fe3ca3e97ac945e2a8fc65e756a307c10ca7e2fa9ffa4708e2c4b941af3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229600, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128u2_s6_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "182c4047d0f6a60d08ed8cec34fa7e3e420fc046599571138e3f43bdb2318915", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229760, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128u2_s6_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "92a3191ac4c92b6b7ea2b391f3a04c9165032ba402790fdfcb8c5fce15ebcdde", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 192704, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "e0cd7cc3a2b8becdd28ddf5c4f03f3ab71a140dca4212114144032a61599dde4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 192864, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "534328318187f4bb6ccf067a4b947d05aeef0c9f846db824587c3b9280855ef9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 155808, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128u2_s4_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "0264e0a1acf795366bdc75e0cb3a6327d2e0dee65b3178d315501c57100294d6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 155968, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128u2_s4_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "b5e5f7fe5a76beeae63ac01ecd560c9a2a478a8df3d8ad2aae5c5e0a4b62c2fe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217280, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "a386d43cad6402f5d3611f36a3c312b130493761ef188d8fc8606fba60e8153b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217440, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "9d2eb7479629bf1c5b391c760e4fea66bc9e70dd814f3ffc5cc3c95b0cb29ddf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 176288, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x32x128u2_s4_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "ab39ae5693c94d54e6479ced2be579612ff35960ceee3f5dfb20573d4559ce36", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 176448, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x32x128u2_s4_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "aad2160da0561af1b9d4b530dab897751177f8a435afc1e54b635904a6490f14", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215264, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128_s6_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "0cd977643fe5a5d3d5ccce9c9b056f7cc4bd200281bdc14aec423e18c60c9c26", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215424, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128_s6_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "32bf089473c875a4acc3a7c794184edc4eac7194265fd4ba616f1386727bf03d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 180416, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "7b5825a538ec80a25466b59feca10cee9dd41f00150fc4f6cf6c2b331dd31342", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 145568, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128_s4_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "612b2dce1ac253d1d6b4395b9ef239388c4884b4a8d2a22dc796423b23243c88", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 180576, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "9bce39bb76b0314aed7912636c38ca81dd644779a388e723d28d01adca00cdb9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 145728, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128_s4_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "5694a2b3cd5a74f527a69bdfccff272c559a61683bfe1c76917c5cba4b2a7808", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229600, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128_s6_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "28ba51b16423e2f2016cb2d58748d82bf554c775f37e24f13750ec8e6cbe33be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 192704, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "b73836fa8ccc51d4eae1963f055562722af1d81b6147b67d20d5456cc0ee80a3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229760, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128_s6_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "d4a72038947212b9784d4565a7b80054ec2179fbed24b940be9bb4d0fb14327a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 192864, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "97ccc39840d524fa8d75a0d2e03203d1defa83d7be11d12368f4bd3cfa726aea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 155808, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128_s4_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "a461b3c5e728493f29e99802e85447c6724375c487a0a540546aabe2b32834f7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 155968, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128_s4_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "c303da9a102b2b60566610b45119e264d5994696a3cf829703ee43420c3f3b0e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217280, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "fd708c8ae33ee98d87d29dfb43f1540466bacc89823684071d4d63498f287887", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217440, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "6c08dcf6608d980a1a8e5a50c11027cbc67465236be3da2ac3a966dcf68b669d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 176288, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x32x128_s4_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "bb32a0d2989b1c6badfac125cdfd5ce2bae7a7c019806b9a4bd058c8da796784", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 176448, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x32x128_s4_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "8957a62704518ea457406f6218969fd7223c06f9a96bd11d9680e66bd5556f39", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 225640, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x64x128u2_s5_et128x64_m256x64x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "19c2cfab9310482f857f723f244264ee16fe1e44e6f4e1139f5831f745ae9232", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 185736, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x128x64u2_s6_et128x128_m256x128x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "145a9b817d9cd2a448535147d26f47214c3a328004be97b21aef33b4331d03cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 64
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 225640, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x64x128_s5_et128x64_m256x64x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "645412e8785a954eb82700e8805e26f97ac1f8e8e4139ab9255c9958e2434360", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215264, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128u2_s6_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "8738b24f81d22f78a0fa1eb853f2d621279a75d6953d0446ef667b0929e27c14", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 185736, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x128x64_s6_et128x128_m256x128x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "c2b330087b82c2f10024df1d4c910edde9b72cfd8ff0d3417aca34ac89455137", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 64
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 64
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 64
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 180416, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "303c6fd50c6837cc724f2f31fb0afc89fd92ee7120dfcb2eafd893fb6a08d5f5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215424, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128u2_s6_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "16588a044f4658bd1b4bedd870841be17c97d52e9b90378f74d7c27c5cce20de", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 180576, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "40d3bc18bfc240b57ba442b04f029e2626b1f3ed3feeec7ab40ddf0861b6bfd8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 145568, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128u2_s4_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "2ae0831916c9806e0be8bab5c0786bf6ff555c5ab289a761b61525b6aa031b10", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229600, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128u2_s6_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "9623995d99f8f83afa4f8ab02b13eae259d9e68566e1ef96bf41b91b190800b3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 145728, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128u2_s4_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "142d8a0ca0e5de125444413bed579f44143cb9949df7558c1e58fc54e5f1cf46", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229760, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128u2_s6_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "7e1355c4d64bf217908d99957504a6cfdba160674f6abbec606c73adf8a4a603", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 192704, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "9fe1afed48a6fe332ae91e76bfb6e42b5703c94bd6d02a4ece79070d127cbc86", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 155808, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128u2_s4_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "0d0cffaa3c855fb315302b1d35db3f96a861e3a115b2c857456f384b4c4ea5ff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 192864, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "077c5d6ccf3dfb062f3bacc94fad2b63e5f254c14983c5d5d9e0290e22ae5f79", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 155968, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128u2_s4_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "23f4b417548861602f6fe89b376d3b347cf2cab38795b9dee8fd9e8e5988eca5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217280, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "fa4b752b38875728efb6f87e9e6b138462f866fa5db02ce37c99c052319237d2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 176288, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x32x128u2_s4_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "e9c44200c315861b17c25c38494e480667431887c359e4d4bd731df27d24ed8b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217440, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "3ddae23339f731eda358f8986075f1deed6893c4bc8d0f3bc13c4ffa8f1f69ab", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 176448, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x32x128u2_s4_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "4a71a44ef4d2cdadfa362c6f84724bd6dbfcefbcb3447740b16ff90aafe2f964", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215264, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128_s6_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "066dfb37d4796d864268ecb2ac31bf2d9bbb198fbe52e8116e84b313c5ef8cc3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 180416, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "c91fe79b9b07d5500d3dca5b4ec693536f554161ebea84bc0cc35c1c5305719e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 215424, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128_s6_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "430b1618488dca7c3a0c1103672d225bb42540183cbdc6aaaca9c3fb0ab90bee", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 145568, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128_s4_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "e90e49645d59026fc30ab69317e73714dbdd4b6a399fe738716952143033efbe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 180576, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "7a7256fea68ddaa4b293bb4eecce76929227705b660dbac2f6142e527d1a1a6f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229600, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128_s6_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "06b4253dd720c865abe2d26ea4b670d39be6987ce3866c8dc29fabd01dddcbac", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 145728, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x8x128_s4_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "8ef88bfc762e3f28dfc254bbcf85c49237496e41ee8abbee46f6b1b063faa8c2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 229760, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128_s6_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "3010d34b8064dbea5bc4f299a0e9639becf87ce25e88c654ba8008d122a234a1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 192704, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "289c93b33b3ad96967eddbc4a016748f91455f25387f60b64b184dc197b8f11a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 155808, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128_s4_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "ddd611783d131d055e547daebacf3119b23514093b5d96fd32443f7712410cdb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 192864, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "a4bda9246dc0189b2adf40df083739528f999c029808a628c98830828f2d8674", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217280, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "629f7a54042af94d20ef881c305fa0fadc917ffddf8c4c1a583b8599d78ef1e4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 155968, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x16x128_s4_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "02db4265364a94c190aa18acbe3f5790ecb57b7bdf113901e6d75683e686c90c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 176288, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x32x128_s4_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "b89d334bd87280efd4f79b1e3e9933e7192b2e707a0ae0bb8a7699d9886ff2ec", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 217440, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "116e7fbcf9524efd9b7fc6b15e657b5d4e8e52aef995212743e24d133ce8b4bb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 176448, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x32x128_s4_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "44ef195c0e3db647ca91e1dac4f8d59470faa61d4cfae45bc66bc33a08933165", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 225640, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x64x128u2_s5_et128x64_m256x64x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "2d37d9721071c722294a1e33ff0b6914966dc124a3eea29f7a56894ad7ed5e44", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 225640, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x64x128_s5_et128x64_m256x64x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "97aa9a7e150f9a53792c16087d61bc67459c8cf488ed1c1eb15136437bb71999", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 218568, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x128x64u2_s8_et128x64_m256x128x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "d8e7a0d17ff9f14a43b4411f3da2060d58872d2b6ae5569744f8f1ec9e2dbc78", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 64
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 218568, "bmm_Bfloat16_Bfloat16Bfloat16_Fp32_t128x128x64_s8_et128x64_m256x128x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "661e278b9c5f7847cd27979f45f98a40ca709923af1bba77e20dbd902b46ff5f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 64
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1052672)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 64
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 168
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ -1
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 64
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 64
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 73952, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "729696fd0ec6abe03170a0b17d877a63596ac354de65519e9a5d937cb0463095", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 88288, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "b6c4a1c9413bc38a602fd4706419119d6472acbac35fda2d0ae02f1049990bc9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 74112, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "d9aa4e9c688065b42b2f5de6ca812ea153b6e5978bbca7c15a247132061b05b7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 88448, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "05d1b48eb5fc9fb8a415d1416110a7f6b1617c8bff445f2242ee9f9bca7c9e5d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 76008, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m256x16x16_cga2x1x1_16dp256b_rM_BN_transOut_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "01205e8beb32ef883e71348f2d8b0bba9d7419f6219e63d4b3045d418ed722f9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 116960, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "c30e724cca6b19841b1e43e0cc5247c61b19b5803cedf04b82aa7615eca716b9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 76168, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m256x16x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "246ae175f44bbdfba4fe7b09c35f6544af50062c7475fee136a8a3eba80c12c2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 92392, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m256x32x16_cga2x1x1_16dp256b_rM_BN_transOut_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "072c01427d245965676b47d68efcb461fadd26888f0c648b63d877dfe67e4dcc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 117120, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "09d0425d972b3d017a53308796d3b0cb84e7e3bf17f7f0cedfd97a1f58b574ad", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 174304, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "8baeb5047588e226d80bad3c59a0a18bca9b44380a162141895c5380649e30a0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 92552, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m256x32x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "d9a15fca554bd4c1117d113a53ec32e85a6a76f62ca2ff0ead3731b3352aea93", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 125160, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m256x64x16_cga2x1x1_16dp256b_rM_BN_transOut_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "ee5dd5c136cc879f4590e8c09ff11b900f87bccdad9628b3e3f49d5f38e0d0ed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 174464, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "fce2b2cfde0747a26f6d196392ebd88d7c54b2f72e5734d73542b5df53d260fb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 73952, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "6876bede52699ee93a4a5148ce1a38c43363e957ddba1801148bdfb906535a65", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 125320, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m256x64x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "77b69a17c77e8dbb64a88dc74eae9d440e2d1ae660bde3735e698bcc12f52126", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 88288, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "c19a2aa7641f4c2f8f7eb1a2221122955b1aecc910550d1bb500cd682df0a26e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 74112, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "8884890537ea0d697902393b841cd2a4fb7f73aef83e4cadedfdd99b2394dd6e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 76008, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m256x16x16_cga2x1x1_16dp256b_rM_BN_transOut_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "5c70d674d9b561b36e2a7b9a9bbb7a824a7e07812eabf23a4263b8c685f4af7d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 88448, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "513f2d5aa98fd54041a37d038f76ca108c322c9cfc46b5c7a7049610887b0f63", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 116960, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "8821716a949d05a575a1a7e6795cecd7f8854a46f02a068e435ee9a599a51a83", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 76168, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m256x16x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "568aaf081aea6780af4f1ebb5dd86917db1ab9f2727d6a4e4bbafd1d0e918702", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 117120, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "5e7d0440af7c7f6742822977d43fad9dcedd1ffdae30122abbc0658c32b9c176", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 92392, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m256x32x16_cga2x1x1_16dp256b_rM_BN_transOut_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "bcb661b993fa70edecb812bf49151b26b166c6ce50fbd4c464e97543b4439dd8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 174304, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "64b8a555a8c2181385da65d617425cf32746f431385a9bb6467a9ba742cb660d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 92552, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m256x32x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "25765701cae658ec13a25fc55246cd22b9e03b557ae6b329dc7df5e508df9574", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 125160, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m256x64x16_cga2x1x1_16dp256b_rM_BN_transOut_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "96b05a51a992a1804eec64a4e41268cf1f5d84661b3a873fea76eaf3fe891635", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 174464, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "7a9d8290d91de3a097783fb702f2e3c4230c2b97c0a8d911cd94b2e4e8e8bed1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 125320, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m256x64x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "068e42e2942b3b0a55d63edea9a395dab280e0e44b6a165b2de7e25088ac3033", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 124200, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x128x128u2_s4_et128x64_m256x128x16_cga2x1x1_16dp256b_rM_BN_transOut_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "5a79edfde645fda1cee6dc36447b4074e74dddbce1db2a4a126eeb411c534815", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 124200, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x128x128_s4_et128x64_m256x128x16_cga2x1x1_16dp256b_rM_BN_transOut_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "018624ad92c5045bf2e0f14ca30cff11501d80b0f39dc0cad20c62eb5af6f2ef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 124360, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x128x128u2_s4_et128x64_m256x128x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "7063640c01bd2c9467ba53f36b7f11848dc4e3dd3b45cfee2b5654d896868665", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 73952, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "dc89d79718b2832d0767dae4a57198a4c59a28aeeafda0361dfd02fc32785bfe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 124360, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x128x128_s4_et128x64_m256x128x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "1a51e71d1a1748af1753ffd7fd3cd84a9f7c9f64e501a457a6e56056fa8fa913", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 1
, /* mGridWaitForPrimaryRouting */ 1
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(2)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 88288, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "88e76329d917d697b088065c5416ea42e187c7d918cdcc3f9c168030c886edc5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 74112, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "85982a4980934ffb106b9715f8ea130f2ff2f17a2bc8c20d988482e79ad34d98", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 88448, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "1f5b41beffa6e1df45148213d065dbaeff754871271a93f043c0dda5c82c5fee", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 76008, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m256x16x16_cga2x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "7b9acaf70bf27b6d15521c1823ed41f1d2ed430789b4a9f340000c680e33887b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 116960, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "40c2f1799a94ca1b649a656553f8101321491e7207cae3abc7fb32d874bf94c0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 76168, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x16x256u2_s3_et128x16_m256x16x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "3ab1f8f6a7fcf04f761678fb634222ce0ce92820a28624166b2e5e44ebbcf6f8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 117120, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "3e2b1d9210f3fdc7c06e530102eb9be12b464ca366b4c48020a9cd552a01e0da", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 92392, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m256x32x16_cga2x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "25e399adef1334e96924fe67ff816348c168f9177a27531f816f97375c37e679", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 174304, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "af09489c6f4ebb3a20e7592d7c77bc9a95292dfe6b4f911ccec0e886e28ce553", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 92552, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x32x256u2_s3_et128x32_m256x32x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a26363a5e9dc8d1890bf23b412404c3b8b8ee2b788398003c8ece2dde3f1a39f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 125160, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m256x64x16_cga2x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "abb2483d3b72d7645f7816f1a31e8267c0311ffe4c35b3a6cb658cacec41bbec", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 174464, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "af95530d871d7fa7d525f45dd0f1636b0eddad8e9ba851a9648899848e691ee4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 73952, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "05154e51043b354b62a5f09f6853b596df75a5ba8288cf457dbd54edd918bf7b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 125320, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x64x256u2_s3_et128x64_m256x64x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "41226de3ff7e38cfdfa5fec14a0bff83311f02dd0cbf9671b52ce5f1c9702060", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 512
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 74112, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "29e49201318683bf84af8c092715b2db4445d855cb4ae9c9592ccabd092156eb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 88288, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "90a3237f443843d313aa3cc97a4c2fe423fc30428c051892e1dc5c8dc3a842ce", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 76008, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m256x16x16_cga2x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "024ef0dfc3ee9acbb70aff47f0db074282198b4c19ca53b21e7ff92843318dcf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 88448, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6ee870b05c0dd66072995af65ade502c4062a761289526abd9fd61e47482b724", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 76168, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x16x256_s3_et128x16_m256x16x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "06528a095d7ab4aa307fd9ac27c674f148f2311300b91003ef0091a48a6025f5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 116960, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "326479c1ebdb0935b0dea96b42f017cd4cfed1c6625d3aa261292615848505c2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 117120, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "3d6816ee8e87c59e174c9294380987b84029190be017a54be545e9ca7f0b5ca8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 92392, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m256x32x16_cga2x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "f90932dcd6b5ce86f81088f90a3da860874d45c88794247fb1a74d37bc0d2056", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 92552, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x32x256_s3_et128x32_m256x32x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "68567fb25bbba6dbcfed20f7eaa5ffbdaef0e65833f16d178966cbc02b7e6a60", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 174304, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "8b66ac7a686d48a0cf93d3acf874199711e237a7e6bfdb6ef35ab874ce5cebf9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 125160, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m256x64x16_cga2x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "579c0c3a472c5d84cbb7061cf5808bb2b4b298f12d3a0acf46a076350425bd29", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 174464, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "52b5ec3b794bc9384e17619a1edaaf582f0579122e85dd764cc7655e8f09aa7c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 124200, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x128x128u2_s4_et128x64_m256x128x16_cga2x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "ae92ea7614e52f054aaf4b58de9dffc3410b553f95a086844bd546dcea4b2a5f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 125320, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x64x256_s3_et128x64_m256x64x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "655f695541c5b556ebe38dcce17ceff32333651c41d3e61c93f5e1ae207a5d3c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 124200, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x128x128_s4_et128x64_m256x128x16_cga2x1x1_16dp256b_rM_BN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "848fa420187ca2d5eceeb87d27860285566d3dbe712812d99aa2181e33b29dec", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 124360, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x128x128u2_s4_et128x64_m256x128x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "09c36037f7359fbcffbe04fc76ed68e7676c915a59d34c6f30a3dc645f12563c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 256
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 124360, "bmm_Bfloat16_MxInt4Bfloat16_castBfloat16_Fp32_bA32_t128x128x128_s4_et128x64_m256x128x16_cga2x1x1_16dp256b_rM_BN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "219cc14e28d20e1478146ef5fd34092457bd09e00b31d34010ea2643bf11f0f6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 256
, /* mClusterDimX */ 2
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17892366)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEltwiseActType */ gemm::EltwiseActType(0)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mFuseUtccpWithUtcmma */ 0
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 128
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 256
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumEpilogueWarps */ 4
, /* mNumRegsCastAWarps */ 112
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsCopySparsityInfo */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 160
, /* mNumRegsPerThreadNonEpilogueWarp */ 96
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ 32
, /* mSfBlockSizeB */ -1
, /* mSfBlockSizeC */ -1
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSparsityA */ trtllm::gen::Sparsity(0)
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseMaxTmemOverlap */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrix */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mValidM */ 256
, /* mValidN */ 256
, /* mValidK */ 128
, /* mWorldSize */ 1
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mBatchStrideInTokens */ -1
, /* mFusedAct */ 0
, /* mGridWaitForPrimaryRouting */ 0
, /* mIsStaticBatch */ 0
, /* mIsUniformNumTokensPerBatch */ 0
, /* mNumBatches */ 128
, /* mNumRegsPerThreadLoadA */ 0
, /* mNumRegsPerThreadLoadB */ 0
, /* mNumRegsPerThreadLoadSfA */ 0
, /* mNumRegsPerThreadLoadSfB */ 0
, /* mNumTokens */ 2
, /* mNumWarpsLoadA */ 0
, /* mNumWarpsLoadB */ 0
, /* mNumWarpsLoadSfA */ 0
, /* mNumWarpsLoadSfB */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)}
, /* mUseTmaOobOpt */ 1
 }, gemm::SmVersion::Sm100f},
#endif // EXCLUDE_SM_100
};
// clang-format on
} // namespace kernels
} // namespace tensorrt_llm

}

