/* * Copyright (c) 2025 by FlashInfer team. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include "cute/tensor.hpp" #include "cutlass/kernel_hardware_info.h" namespace flat::collective { using namespace cute; template CUTE_DEVICE void gemm_reset_zero_acc(Atom& atom, TA const& tA, TB const& tB, TC&& tC) { constexpr int rA = decltype(rank(tA))::value; constexpr int rB = decltype(rank(tB))::value; constexpr int rC = decltype(rank(tC))::value; if constexpr (rA == 2 && rB == 2 && rC == 1) { CUTE_UNROLL for (int k_block = 0; k_block < size<1>(tA); k_block++) { cute::gemm(atom, tA(_, k_block), tB(_, k_block), tC); atom.accumulate_ = GMMA::ScaleOut::One; } } else { static_assert(rA == 3 && rB == 3 && rC == 3); CUTE_UNROLL for (int k_block = 0; k_block < size<2>(tA); k_block++) { cute::gemm(atom, tA(_, _, k_block), tB(_, _, k_block), tC); atom.accumulate_ = GMMA::ScaleOut::One; } } } template CUTE_DEVICE void gemm_zero_acc(Atom& atom, TA const& tA, TB const& tB, TC&& tC) { atom.accumulate_ = GMMA::ScaleOut::Zero; gemm_reset_zero_acc(atom, tA, tB, tC); } template