/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION &
* AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


#pragma once

#include <flashinfer/trtllm/gemm/trtllmGen_gemm_export/GemmOptions.h>

namespace gemm { 


namespace tensorrt_llm
{
namespace kernels
{
// clang-format off

#define TLLM_GEN_EXPORT_VERSION "7.0"


static const gemm::GemmConfig tllmGenGemmList[] = {
#ifndef EXCLUDE_SM_100
{nullptr, 0, 128000, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s7_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_BN_transOut_schedS_sm100f", 224, "c34d83ebf755d3cbb956250103e75d185075e89a62a22889ffe98fad2bb2fc01", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 128000, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s7_et128x8_m128x8x32_cga1x1x3_16dp256b_rM_splitK3_BN_transOut_schedS_sm100f", 224, "51551e78e354340b9873c492a59f40c2071306161d84d3230f3b5db0bc39dfa5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 3
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 384
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 3
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 145408, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_BN_transOut_schedS_sm100f", 224, "603dfb6c0660f4b64a2231874b084b35bdc24a2d653a3d850c855b79682431ac", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190464, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x3_16dp256b_rM_splitK3_BN_transOut_schedS_sm100f", 224, "958ba566c61e1228dd6c367b9183faefaaf342da1fb84eaf02402066c590b9fa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 3
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 768
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 3
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 135168, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s7_et128x16_m128x16x32_cga1x1x2_16dp256b_rM_splitK2_BN_transOut_schedS_sm100f", 224, "cb33072060c241db06bc6723ebd26c9d5f83dd7b6510dcea17b7912c6b6ab1aa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 135168, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s7_et128x16_m128x16x32_cga1x1x3_16dp256b_rM_splitK3_BN_transOut_schedS_sm100f", 224, "1cf385af67b0eb3be318667471bd55d6a2697e82dcda3dc015996f3b80a2eab7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 3
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 384
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 3
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 145408, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x3_16dp256b_rM_splitK3_BN_transOut_schedS_sm100f", 224, "14137508ca49937dc6567bc234a6542cc438f005935248c967427629a0fb882e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 3
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 768
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 3
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190464, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x2_16dp256b_rM_splitK2_BN_transOut_schedS_sm100f", 224, "f310ec11067b04d25c9cc0ec861638a528f9854464de105c3e3aaa65852707f3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190464, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s9_et128x32_m128x32x32_cga1x1x2_16dp256b_rM_splitK2_BN_transOut_schedS_sm100f", 224, "b2119b0b04f9a3b9ef2bf62b272297b2074bb52029be97df6b3cad34e5a3facc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 9
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190464, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s9_et128x32_m128x32x32_cga1x1x3_16dp256b_rM_splitK3_BN_transOut_schedS_sm100f", 224, "2d96b6b30b9ee0db6a8cf10c94ec196ecddc84d7f1004fccac2e973ef61d21d7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 3
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 384
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 3
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 9
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 210944, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x3_16dp256b_rM_splitK3_BN_transOut_schedS_sm100f", 224, "d97f8fe55e173da81fabf3bd050612fd793b98685ab452e1dd4769d47fdcd85c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 3
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 768
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 3
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 210944, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x2_16dp256b_rM_splitK2_BN_transOut_schedS_sm100f", 224, "b41eb46c1461068e7ef6544a0699b826273a6d94dd8655f682af0e8ea81a9e04", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 178176, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s7_et128x64_m128x64x32_cga1x1x3_16dp256b_rM_splitK3_BN_transOut_schedS_sm100f", 224, "3ffc255b82f4a22b5760f56ca25d6fce38aa5f7fbfdd170911fe083a7002c173", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 3
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 384
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 3
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 178176, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s7_et128x64_m128x64x32_cga1x1x2_16dp256b_rM_splitK2_BN_transOut_schedS_sm100f", 224, "835cf4d9f52a59cd045286f551012e6caf06433280a9caebf4a88bc9b7680fd7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 256
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 7
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 153600, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x3_16dp256b_rM_splitK3_BN_transOut_schedS_sm100f", 224, "2926ed79afdbc951fa1b353022efb7ab40e9bb854fd8b6c7d33ed5f2f2c41dc8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 3
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 768
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 3
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 153600, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x2_16dp256b_rM_splitK2_BN_transOut_schedS_sm100f", 224, "4bbcf69eb4553d3564649ce2d6596e26bdd25ca653d88f1c7c0acca226fa8284", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 512
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 121856, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_sm100f", 384, "e7089761ff86ef205cefc8cad9dc7f7571386d2981e5aa272b0aa5e6e7341ae0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 113664, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s6_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP2x2x1x3_sm100f", 416, "7e75019ba00a35042af127c6f69ea05f467847f8f91e708e1d45dddd73956114", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 138240, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_sm100f", 384, "be74ac9cedd4f0170519f8809a745d9694713dc09d0bb5fb600ccd813dff982f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 138240, "gemm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP2x2x1x3_sm100f", 416, "4251be95112d7c95a6d0f715104bb8795d0c56f24ba5aa8d062d11f4bbc22cc9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 189440, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s4_et128x128_m128x128x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 320, "7bd3e993344a2f49ee6c24eb998ff36222a83febc8020b13c61a54c6ca23784c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 189440, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s4_et128x128_m128x128x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 320, "71a6db88913432e8dc105e3fdeb2fe9b32575eaf9e5153f08b50f48c67082dcf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 189440, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s4_et128x128_m128x128x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_sm100f", 352, "20f3c775b8a7f30a5c6847596de9a002d46c807c0093ded823e98830d11c8c1c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 189440, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s4_et128x128_m128x128x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_sm100f", 352, "aebba5055f64443ca6c18acc6e261d1c99d373d4fbe77905c2117fab8ddae4ea", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190464, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "3c29a4b3b68f338257a848b6f339d7be255467dd7a414368162ea5eb16be864c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 190464, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "83aaf5cd17e4ab57cbd75975f0e5b61f10a56efa890077c0e19194d41a0c0cc0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 136192, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s3_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "cabb26dda044c3b1f4f64bbb3eb594c3a6bdc6cb35d37303553c63f0718b84c4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 62464, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s2_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "63a13b692b6b1628f72a447e92c4c14be4f435476aacc7cbd1cf60b0a45c7779", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 136192, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s3_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "c684fbe252ca340f4005266172a54bd9a7eae9fe962ff75d89d1c89ce374dadd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 62464, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s2_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "a7b0c8250e5cce62ee9e5c25b665cf3bdd2393cc265a1c101bae437f834447d1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 178176, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "de114b6b3546f9056d30384eb752885dfa223d997801f103438cb66d74ad656c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 178176, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "dabc5a9706fedc2c75ef6991c99f00920769cecdc33ec7d3eb32c97fdbb4f6fd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 154624, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "6924777125b5fe2651990cd3cb6632a3e3c47850784e0a30fa3af307bd9f1d36", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 166912, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "624411f97f15e60929fb78657c934ca892792f81861fd65f545bf8ab77a38b79", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 154624, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "dd159b642849c99b1447d9dc0133943911c0f61adca7249d64ed6738907c5b00", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 152576, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 320, "020f6abaab06d736f96d1ca1a2ae3addf54e97fad5fe893bc100145eb59c11a9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 166912, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "8a7142bd1ce30b50d5a682128020d7d1e4628c1a5f9842064a5e5656d363a737", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 152576, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 320, "187720d538d47eb01164e79d51ae260295fe5e6776a7bd4038ffed0a1d45c682", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 126976, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s3_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "4dfc53dc0dad59cf303cf35539c639fc6f6a3f04a13a002eb406da87367eaf4a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 126976, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s3_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "65cc0ed94ddf27cd046789b7189e5981315be87d86af315c2d3840dcb56ccb26", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 49152, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s2_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "766faf547cf1428cb0df3bcbea2a7f6543ae1ad4708e7bf840348b8df38ceec8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 49152, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s2_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "1efe8c530186cc47f1f3ddd4d23f4201aecb7e40844cb95ac7eaa65e77b8ffc3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 88064, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s2_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "1032a9f4753cc10d30e01adb396a4cdfff1e04a790d809342cdf3e1054be3177", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 88064, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s2_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "0836be01d5159f6a979258b37f7eb287c82bf5f1d3b59a88de06b69af3198513", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 115712, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s2_et128x128_m128x128x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 320, "4bb935cb1ba167b1c0f4746b080b74794b527fe827fc717183dd59ede62ae694", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 115712, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s2_et128x128_m128x128x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 320, "631d9b90df69319df660053c8a73bb1a2dba0f82287820bc47aab662ea321b8a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 107520, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "dee5748ec7d5ffba9e827cff9d99af494dde1b6cbaccd1c2bbf2e1eab2ed137e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 107520, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "b310e9f13adb0a3eaa630a10003552843ebcf5f7a4867de03536f54c70f52398", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 256
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 95232, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s2_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "26b47189666223201cce28716c48e304ebaaf7619d079f12c2e996e3516440dc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 95232, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s2_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 448, "c869a7a7cd39b0ff2382133ad30ad0e742870e3b523df36ae95e21ef146ed57d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 512
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 152576, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128_s6_et128x128_m128x128x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 320, "3222ab636c71f63009cd8ec48eb6a93ebd018b9fd64ef21eb442896b60c310b2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 152576, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128u2_s6_et128x128_m128x128x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 320, "9e545304edf3b3e79f1f9c1f8b896f48a469f77f84b550811bde0ca627bcbc05", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 97280, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128u2_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 320, "d1ba98574133f98563868786eacd88f5a22b2ade30784a934e3ba92eee5ccb97", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 8192
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 1
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
{nullptr, 0, 97280, "gemm_Bfloat16_E2m1E2m1_Fp32_t128x128x128_s3_et128x128_m128x128x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_sm100f", 320, "72db3bf0019c20300628774cb6300c9b3d7112a1e3c76f369af77dff89c0bf3f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mCtaSwizzleType */ gemm::CtaSwizzleType(0)
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 128
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 0
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 1
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 4096
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 128
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumRegsCastAWarps */ 0
, /* mNumRegsCopySfLdsSttm */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mSfBlockSizeA */ std::nullopt
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTileK */ 128
, /* mTileM */ 128
, /* mTileN */ 128
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mTransposeMmaOutput */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseDeepSeekFp8 */ 0
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseShuffledMatrixA */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mUseUnrollLoop2xForMma */ 0
, /* mWorldSize */ 1
 }, gemm::SmVersion::Sm100f},
#endif // EXCLUDE_SM_100
};
// clang-format on
} // namespace kernels
} // namespace tensorrt_llm
}
