/*************************************************************************************************** * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ #pragma once #include #include #include #include #include namespace cute { namespace SM120::BLOCKSCALED { template CUTE_HOST_DEVICE constexpr void mma_unpack(MMA_Traits const& traits, Tensor & D, Tensor const& A_zipped, Tensor const& B_zipped, Tensor const& C) { static_assert(is_rmem::value, "Expected registers in MMA_Atom::call"); static_assert(is_rmem::value, "Expected registers in MMA_Atom::call"); static_assert(is_rmem::value, "Expected registers in MMA_Atom::call"); static_assert(is_rmem::value, "Expected registers in MMA_Atom::call"); // Register value types from the MMA_Operation register arrays using RegTypeD = typename remove_extent::type; using RegTypeA = typename remove_extent::type; using RegTypeB = typename remove_extent::type; using RegTypeC = typename remove_extent::type; using RegTypeSFA = typename remove_extent::type; using RegTypeSFB = typename remove_extent::type; constexpr int RegNumD = extent::value; constexpr int RegNumA = extent::value; constexpr int RegNumB = extent::value; constexpr int RegNumC = extent::value; constexpr int RegNumSFA = extent::value; constexpr int RegNumSFB = extent::value; auto [A, SFA] = unzip_tensor(A_zipped); auto [B, SFB] = unzip_tensor(B_zipped); using Shape_MNK = typename MMA_Traits::Shape_MNK; constexpr int SFVecSize = MMA_Traits::SFVecSize; // Assert logical size CUTE_STATIC_ASSERT_V(size(SFA) == size<2>(Shape_MNK{})); CUTE_STATIC_ASSERT_V(size(SFB) == size<2>(Shape_MNK{})); // Assert physical size CUTE_STATIC_ASSERT(decltype(cosize(layout(SFA))){} == size<2>(Shape_MNK{}) / SFVecSize); CUTE_STATIC_ASSERT(decltype(cosize(layout(SFB))){} == size<2>(Shape_MNK{}) / SFVecSize); Tensor rA = recast(A); Tensor rB = recast(B); CUTE_STATIC_ASSERT_V(size(rA) == Int{}); CUTE_STATIC_ASSERT_V(size(rB) == Int{}); Tensor rD = recast(D); Tensor rC = recast(C); CUTE_STATIC_ASSERT_V(size(rD) == Int{}); CUTE_STATIC_ASSERT_V(size(rC) == Int{}); Tensor rSFA = recast(filter_zeros(SFA)); Tensor rSFB = recast(filter_zeros(SFB)); CUTE_STATIC_ASSERT_V(size(rSFA) == Int{}); CUTE_STATIC_ASSERT_V(size(rSFB) == Int{}); detail::explode(MMAOp::fma, rD, make_int_sequence{}, rA, make_int_sequence{}, rB, make_int_sequence{}, rC, make_int_sequence{}, rSFA, make_int_sequence{}, rSFB, make_int_sequence{}); } } // namespace SM120::BLOCKSCALED //////////////////////////////////////////////////////////////////////////////////////////////////// // MMA F8F6F4 16x8x32 TN template struct MMA_Traits> : MMA_Traits { // The MMA accepts 8-bit inputs regardless of the types for A and B using ValTypeA = uint8_t; using ValTypeB = uint8_t; using ValTypeD = c_type; using ValTypeC = c_type; }; //////////////////////////////////////////////////////////////////////////////////////////////////// // MMA MXF8F6F4 16x8x64 TN template struct MMA_Traits> { // The MMA accepts 4-bit inputs regardless of the types for A and B using ValTypeA = uint4_t; using ValTypeB = uint4_t; using ValTypeD = c_type; using ValTypeC = c_type; using ValTypeSF = sf_type; constexpr static int SFVecSize = VS; using Shape_MNK = Shape<_16,_8,_64>; using ThrID = Layout<_32>; // (T32,V32) -> (M16,K64) using ALayout = Layout,Shape < _8,_2, _2>>, Stride,Stride<_16,_8,_512>>>; // (T32,V16) -> (M16,K64) using BLayout = Layout,Shape <_8, _2>>, Stride,Stride<_8,_256>>>; // (T32,V64) -> (M16,K64) using SFALayout = Layout,_64>, // Effectively 16 threads due to the 2:0 mode Stride,_16>>; // (T32,V64) -> (N8,K64) using SFBLayout = Layout,_64>, // Effectively 8 threads due to the 4:0 mode Stride, _8>>; // (T32,V4) -> (M16,N8) using CLayout = SM80_16x8_Row; }; //////////////////////////////////////////////////////////////////////////////////////////////////// // MMA MXF8F6F4 16x8x32 TN template struct MMA_Traits> { using UnderlyingTraits = MMA_Traits>; // The MMA accepts 8-bit inputs regardless of the types for A and B using ValTypeA = typename UnderlyingTraits::ValTypeA; using ValTypeB = typename UnderlyingTraits::ValTypeB; using ValTypeD = typename UnderlyingTraits::ValTypeD; using ValTypeC = typename UnderlyingTraits::ValTypeC; using Shape_MNK = typename UnderlyingTraits::Shape_MNK; using ThrID = typename UnderlyingTraits::ThrID; using ALayout = typename UnderlyingTraits::ALayout; using BLayout = typename UnderlyingTraits::BLayout; using CLayout = typename UnderlyingTraits::CLayout; // Scaling factor using ValTypeSF = sf_type; constexpr static int SFVecSize = VS; // (T32,V32) -> (M16,K32) using SFALayout = Layout,_32>, // Effectively 16 threads due to the 2:0 mode Stride,_16>>; // (T32,V32) -> (N8,K32) using SFBLayout = Layout,_32>, // Effectively 8 threads due to the 4:0 mode Stride, _8>>; }; // Transform if needed template CUTLASS_DEVICE void fp4_shift_A(MMA_Op const& op, Tensor&& tensor) { } template CUTLASS_DEVICE void fp4_shift_B(MMA_Op const& op, Tensor&& tensor) { } // For SM120 MMA F8F6F4 input fp4, the operand A/B are load from ld.matrix. // ld.matrix b4x16_p64 places FP4 data at the first four bits in each // eight-bit container, whereas MMA F8F6F4 expects the four-bit data to be in // the middle of the eight-bit container. Thus, e2m1 operands being fed // to MMA F8F6F4 must be shifted left by two bits. // 0b0000ABCD --> 0b00ABCD00 // NOTE: Same transformation is NOT needed for FP6 and FP8. template CUTLASS_DEVICE void fp4_shift_A(SM120_16x8x32_TN const&, Tensor&& tensor) { using RegisterTypeA = typename remove_extent::ARegisters>::type; if constexpr (cute::is_same_v) { cute::transform(recast(tensor), [](RegisterTypeA& v){ return v << 2; }); } } template CUTLASS_DEVICE void fp4_shift_B(SM120_16x8x32_TN const&, Tensor&& tensor) { using RegisterTypeB = typename remove_extent::BRegisters>::type; if constexpr (cute::is_same_v) { cute::transform(recast(tensor), [](RegisterTypeB& v){ return v << 2; }); } } namespace SM120::BLOCKSCALED { // Template function with scale factor needs to enmuerate types one by one, as template // arguments contatins two variadic lists, which cannot be deduced in one shot. template CUTLASS_DEVICE void fp4_shift_A(SM120::BLOCKSCALED::SM120_16x8x32_TN_VS const&, Tensor&& tensor) { using RegisterTypeA = typename remove_extent::ARegisters>::type; if constexpr (cute::is_same_v) { cute::transform(recast(tensor), [](RegisterTypeA& v){ return v << 2; }); } } template CUTLASS_DEVICE void fp4_shift_B(SM120::BLOCKSCALED::SM120_16x8x32_TN_VS const&, Tensor&& tensor) { using RegisterTypeB = typename remove_extent::BRegisters>::type; if constexpr (cute::is_same_v) { cute::transform(recast(tensor), [](RegisterTypeB& v){ return v << 2; }); } } } } // end namespace cute