/* * SPDX-FileCopyrightText: Copyright (c) 2011-2024 NVIDIA CORPORATION & AFFILIATES. All rights * reserved. SPDX-License-Identifier: NVIDIA TensorRT Source Code License Agreement * * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual * property and proprietary rights in and to this material, related * documentation and any modifications thereto. Any use, reproduction, * disclosure or distribution of this material and related documentation * without an express license agreement from NVIDIA CORPORATION or * its affiliates is strictly prohibited. */ #pragma once #include #include "fmha/fragment.h" #include "fmha/utils.h" namespace fmha { //////////////////////////////////////////////////////////////////////////////////////////////////// struct Sum_ { enum { IS_SUM = 1 }; static inline __device__ float apply(float x, float y) { return x + y; } }; //////////////////////////////////////////////////////////////////////////////////////////////////// struct Max_ { enum { IS_SUM = 0 }; static inline __device__ float apply(float x, float y) { return fmaxf(x, y); } }; //////////////////////////////////////////////////////////////////////////////////////////////////// template inline __device__ float apply_exp_(float x, float max) { return isinf(x) ? 0.f : __expf(x - max); } //////////////////////////////////////////////////////////////////////////////////////////////////// template <> inline __device__ float apply_exp_<2>(float x, float max) { return __expf(x - max); } //////////////////////////////////////////////////////////////////////////////////////////////////// template inline __device__ float get_alibi_head_scaling_factor(int const in_head_id, AlibiParams const& params) { int const head_id = params.head_idx_offset + in_head_id; if (head_id < params.h_pow_2) { // 2^(head_id * -8 / h) return exp2f((head_id + 1) * 2 * params.alibi_neg4_div_h) * params.scale_after_alibi; } else { // 1,3,5... etc float const adjusted_head_id = 2 * (head_id - params.h_pow_2) + 1; // 2^(adjusted_head_id * -4 / h) return exp2f(adjusted_head_id * params.alibi_neg4_div_h) * params.scale_after_alibi; ; } } //////////////////////////////////////////////////////////////////////////////////////////////////// template struct ReadType { using T = float; }; template <> struct ReadType<4> { using T = float; }; template <> struct ReadType<8> { using T = float2; }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Smem_tile_reduce { // Helper class to distribute MMA tiles reduced over rows per warp over quads. // The Mma tile. using Mma_tile = typename Traits::template Mma_tile; // The number of MMAs in M/N dimensions. enum { MMAS_M = Mma_tile::MMAS_M }; enum { MMAS_N = Mma_tile::MMAS_N }; enum { WARPS_M = Cta_tile::WARPS_M }; enum { WARPS_N = Cta_tile::WARPS_N }; static constexpr int ROWS = WARPS_M * MMAS_M * 16; static constexpr int COLS = WARPS_N; static constexpr int ROWS_PER_XOR_PATTERN = (COLS == 8) ? 4 : 8; static constexpr int BYTES_PER_TILE = ROWS * COLS * sizeof(float); static constexpr int ELTS_PER_TILE = ROWS * COLS; static constexpr int THREADS_PER_GROUP = Kernel_traits::Gmem_tile_o::THREADS_PER_ROW; static constexpr int ROWS_PER_WARP = 32 / THREADS_PER_GROUP; static constexpr int LOOPS = Kernel_traits::Gmem_tile_o::LOOPS; using read_t = typename ReadType::T; __device__ inline Smem_tile_reduce(float* smem_, int const tidx) { int lane = tidx % 32; int warp = tidx / 32; int warp_m = warp % WARPS_M; int warp_n = warp / WARPS_M; qid_ = lane % 4; int qp = lane / 4; // Swizzle the column to avoid 2-fold bank conflicts when we have 8 warps. // This won't affect reading as we assume commutative reduction ops. int const col = warp_n ^ (qp / ROWS_PER_XOR_PATTERN); smem_write_ = &smem_[warp_m * 16 * MMAS_M * WARPS_N + qp * WARPS_N + col]; smem_read_ = &reinterpret_cast(smem_)[warp_m * 16 * MMAS_M * 4 + qp * 4 + qid_]; } __device__ inline void store(float (&frag)[2 * MMAS_M]) { if (qid_ == 0) { #pragma unroll for (int mi = 0; mi < MMAS_M; mi++) { int offset = mi * 16 * WARPS_N; smem_write_[offset + 0 * 8 * WARPS_N] = frag[mi * 2 + 0]; smem_write_[offset + 1 * 8 * WARPS_N] = frag[mi * 2 + 1]; } } } __device__ inline void load(read_t (&frag)[2 * MMAS_M]) { #pragma unroll for (int mi = 0; mi < MMAS_M; mi++) { int offset = mi * 16 * 4; frag[mi * 2 + 0] = smem_read_[offset + 0 * 8 * 4]; frag[mi * 2 + 1] = smem_read_[offset + 1 * 8 * 4]; } } int qid_; float* smem_write_; read_t* smem_read_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax_base { // The Mma tile. using Mma_tile = typename Traits::template Mma_tile; // The number of MMAs in M/N dimensions. enum { MMAS_M = Mma_tile::MMAS_M }; enum { MMAS_N = Mma_tile::MMAS_N }; // The number of groups of warp such that we have at most 4 warps writing consecutive elements. enum { GROUPS = fmha::Div_up::VALUE }; // The number of elements that we are going to store per row. enum { ELEMENTS_PER_ROW = Cta_tile::WARPS_N / GROUPS }; // The number of rows. enum { ROWS = Cta_tile::M * GROUPS }; // The total number of elements. enum { ELEMENTS = ROWS * ELEMENTS_PER_ROW }; // If shared memory is used enum { USE_SHARED_MEMORY = Cta_tile::WARPS_N > 1 }; // DEBUG. static_assert(ELEMENTS == Cta_tile::M * Cta_tile::WARPS_N, ""); // END OF DEBUG. // The number of rows per thread. enum { ROWS_PER_THREAD = MMAS_M * 2 }; // Ctor. template inline __device__ Softmax_base(Params const& params, void* smem, int bidb, int tidx) : smem_(reinterpret_cast(smem)), tidx_(tidx) { // Extract the position in the warp. int warp = tidx / Cta_tile::THREADS_PER_WARP; int lane = tidx % Cta_tile::THREADS_PER_WARP; // Decompose the warp index into M and N. int warp_m = warp % Cta_tile::WARPS_M; int warp_n = warp / Cta_tile::WARPS_M; // Decompose the warp-n index into group/position-inside-the-group. int warp_g = warp_n / ELEMENTS_PER_ROW; int warp_i = warp_n % ELEMENTS_PER_ROW; // The location written by the threads. int write_row = warp_g * Cta_tile::M + warp_m * Mma_tile::M_PER_MMA + lane / 4; int write_col = warp_i; // Assemble the write pointer. smem_write_ = &smem_[write_row * ELEMENTS_PER_ROW + write_col]; // Assemble the read pointer. smem_read_ = &smem_[warp_m * Mma_tile::M_PER_MMA + lane / 4]; } // Apply mask before softmax. Use 1 byte per MMA distributed as 2x4. template inline __device__ void apply_mask(Mask const& mask) { #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ii = 0; ii < 2; ++ii) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { #pragma unroll for (int jj = 0; jj < 4; ++jj) { if (!mask.is_valid(mi, ni, ii, jj)) { elt_[2 * mi + ii][4 * ni + jj] = -FLT_MAX; } } } } } } template inline __device__ void apply_mask_alibi(Mask const& mask, int head_id, AlibiParams const& alibi_params) { // 'if constexpr' because ALiBi is only defined for causal masks if constexpr (Kernel_traits::CAUSAL_MASK) { float m = get_alibi_head_scaling_factor(head_id, alibi_params); #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ii = 0; ii < 2; ++ii) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { #pragma unroll for (int jj = 0; jj < 4; ++jj) { int row, col; mask.get_row_col(row, col, mi, ni, ii, jj); if (mask.is_valid(row, col)) { // Since softmax is shift invariant, // it is sufficient just to use the column as the multiplier elt_[2 * mi + ii][4 * ni + jj] = elt_[2 * mi + ii][4 * ni + jj] * alibi_params.scale_after_alibi + m * (col + alibi_params.sequence_pos_offset); } else { elt_[2 * mi + ii][4 * ni + jj] = -FLT_MAX; } } } } } } else { __builtin_unreachable(); } } // Apply the mask to unpacked data. inline __device__ void apply_mask(uint32_t const (&packed_mask)[MMAS_M]) { // This code works only if we have MMAS_N <= 4. static_assert(MMAS_N <= 4, ""); // Expand the mask. int mask[MMAS_M * 2][MMAS_N * 4]; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { mask[2 * mi + 0][4 * ni + 0] = packed_mask[mi] & (1u << (8 * ni + 0)); mask[2 * mi + 0][4 * ni + 1] = packed_mask[mi] & (1u << (8 * ni + 1)); mask[2 * mi + 1][4 * ni + 0] = packed_mask[mi] & (1u << (8 * ni + 2)); mask[2 * mi + 1][4 * ni + 1] = packed_mask[mi] & (1u << (8 * ni + 3)); mask[2 * mi + 0][4 * ni + 2] = packed_mask[mi] & (1u << (8 * ni + 4)); mask[2 * mi + 0][4 * ni + 3] = packed_mask[mi] & (1u << (8 * ni + 5)); mask[2 * mi + 1][4 * ni + 2] = packed_mask[mi] & (1u << (8 * ni + 6)); mask[2 * mi + 1][4 * ni + 3] = packed_mask[mi] & (1u << (8 * ni + 7)); } } // Apply the mask. #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N * 4; ++ni) { if (!mask[mi][ni]) { elt_[mi][ni] = -FLT_MAX; } } } } // Mask the elements that are outside the the sequence length. inline __device__ void apply_mask(int const actual_seqlen) { // The warp/lane decomposition. int const warp = threadIdx.x / Cta_tile::THREADS_PER_WARP; int const lane = threadIdx.x % Cta_tile::THREADS_PER_WARP; // The warp in the n dimension. int const warp_n = warp / Cta_tile::WARPS_M; // The position within a quad. int const quad_lane = lane % 4; #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // Determine the position in the sequence. int const offset = ni * Mma_tile::N_PER_MMA_PER_CTA + warp_n * 16; if (offset + 0 + 2 * quad_lane >= actual_seqlen) { elt_[mi][4 * ni + 0] = -FLT_MAX; // 0 } if (offset + 1 + 2 * quad_lane >= actual_seqlen) { elt_[mi][4 * ni + 1] = -FLT_MAX; // 1 } if (offset + 8 + 2 * quad_lane >= actual_seqlen) { elt_[mi][4 * ni + 2] = -FLT_MAX; // 8 } if (offset + 9 + 2 * quad_lane >= actual_seqlen) { elt_[mi][4 * ni + 3] = -FLT_MAX; // 9 } } } } // Apply the exp to all the elements. inline __device__ void apply_exp(float const max) { #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N * 4; ++ni) { elt_[mi][ni] = apply_exp_(elt_[mi][ni], max); } } } // Apply the exp to all the elements. inline __device__ void apply_scale_exp(float const (&max)[MMAS_M * 2], float scale_bmm1) { #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N * 4; ++ni) { elt_[mi][ni] = apply_exp_(scale_bmm1 * elt_[mi][ni], max[mi]); } } } // Apply the exp to all the elements. inline __device__ void apply_exp(float const (&max)[MMAS_M * 2]) { #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N * 4; ++ni) { elt_[mi][ni] = apply_exp_(elt_[mi][ni], max[mi]); } } } // Do a warp-wide reduction. template inline __device__ void reduce_Nx1(float (&dst)[MMAS_M * 2]) { #if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) if (Functor::IS_SUM) { // Apply the summation inside the thread. #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { float tmp[2] = {0.f, 0.f}; #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { tmp[0] += elt_[mi][4 * ni + 0] + elt_[mi][4 * ni + 1]; tmp[1] += elt_[mi][4 * ni + 2] + elt_[mi][4 * ni + 3]; } dst[mi] = tmp[0] + tmp[1]; } } else #endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) { // Apply the functor for each row inside a thread. #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { dst[mi] = elt_[mi][0]; #pragma unroll for (int ni = 1; ni < MMAS_N * 4; ++ni) { dst[mi] = Functor::apply(dst[mi], elt_[mi][ni]); } } } // Apply the functor for each row inside each group of 4 threads. #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 1)); dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 2)); } } // Do a CTA-wide reduction. template inline __device__ float reduce_2x2() { float dst[MMAS_M * 2]; #if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) if (Functor::IS_SUM) { // Apply the summation inside the thread. #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { // Pair-wise adds in the different threads of the reference code (x+y and z+w). float a_01 = elt_[mi][0] + elt_[mi][1]; float a_45 = elt_[mi][4] + elt_[mi][5]; //// tmp[0/1] += __shfl_xor(2) in the reference code. a_01 += elt_[mi][2] + elt_[mi][3]; a_45 += elt_[mi][6] + elt_[mi][7]; //// tmp[0/1] += __shfl_xor(8) in the reference code. a_01 += a_45; if (MMAS_N >= 3) { float a_89 = elt_[mi][8] + elt_[mi][9]; a_89 += elt_[mi][10] + elt_[mi][11]; if (MMAS_N == 4) { float a_cd = elt_[mi][12] + elt_[mi][13]; a_cd += elt_[mi][14] + elt_[mi][15]; a_89 += a_cd; } a_01 += a_89; } dst[mi] = a_01; } } else #endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) { // Apply the functor for each row inside a thread. #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { dst[mi] = elt_[mi][0]; #pragma unroll for (int ni = 1; ni < MMAS_N * 4; ++ni) { dst[mi] = Functor::apply(dst[mi], elt_[mi][ni]); } } } // Apply the functor for each row inside each group of 4 threads. #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 1)); dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 2)); } // Store the different values. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { if (tidx_ % 4 == 0) { smem_write_[(mi * Mma_tile::M_PER_MMA_PER_CTA + 0) * ELEMENTS_PER_ROW] = dst[2 * mi + 0]; smem_write_[(mi * Mma_tile::M_PER_MMA_PER_CTA + 8) * ELEMENTS_PER_ROW] = dst[2 * mi + 1]; } } // Make sure the values are in shared memory. __syncthreads(); // Load 2 values (one for each warp). float2 tmp = reinterpret_cast(smem_)[tidx_]; // Compute the reduction of those 2 values in a binary-tree fashion. return Functor::apply(tmp.x, tmp.y); } // Do a CTA-wide reduction. template inline __device__ float reduce_1x4() { float dst[MMAS_M * 2]; #if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) if (Functor::IS_SUM) { // Apply the summation inside the thread. #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { float tmp[2] = {0.f, 0.f}; #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { tmp[0] += elt_[mi][4 * ni + 0] + elt_[mi][4 * ni + 1]; tmp[1] += elt_[mi][4 * ni + 2] + elt_[mi][4 * ni + 3]; } dst[mi] = tmp[0] + tmp[1]; } } else #endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) { // Apply the functor for each row inside a thread. #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { dst[mi] = elt_[mi][0]; #pragma unroll for (int ni = 1; ni < MMAS_N * 4; ++ni) { dst[mi] = Functor::apply(dst[mi], elt_[mi][ni]); } } } // Apply the functor for each row inside each group of 4 threads. #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 1)); dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 2)); } // Store the different values. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { if (tidx_ % 4 == 0) { smem_write_[(mi * Mma_tile::M_PER_MMA_PER_CTA + 0) * ELEMENTS_PER_ROW] = dst[2 * mi + 0]; smem_write_[(mi * Mma_tile::M_PER_MMA_PER_CTA + 8) * ELEMENTS_PER_ROW] = dst[2 * mi + 1]; } } // Make sure the values are in shared memory. __syncthreads(); // Load 8 values (one for each warp). The /8 corresponds to /(4*2) where 4 is from the // float4. float4 tmp[1]; if (tidx_ < Cta_tile::M) { tmp[0] = reinterpret_cast(&smem_[0 * ELEMENTS / 2])[tidx_]; } // Compute the reduction of those 8 values in a binary-tree fashion. tmp[0].x = Functor::apply(tmp[0].x, tmp[0].y); tmp[0].z = Functor::apply(tmp[0].z, tmp[0].w); tmp[0].x = Functor::apply(tmp[0].x, tmp[0].z); // Return the final reduction. return tmp[0].x; } // Do a CTA-wide reduction. template inline __device__ float reduce_1x8() { float dst[MMAS_M * 2]; #if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) if (Functor::IS_SUM) { // Apply the summation inside the thread. float tmp[MMAS_M * 2][2]; #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { tmp[mi][0] = 0.f; tmp[mi][1] = 0.f; #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { tmp[mi][0] += elt_[mi][4 * ni + 0]; tmp[mi][0] += elt_[mi][4 * ni + 1]; tmp[mi][1] += elt_[mi][4 * ni + 2]; tmp[mi][1] += elt_[mi][4 * ni + 3]; } dst[mi] = tmp[mi][0] + tmp[mi][1]; } } else #endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) { // Apply the functor for each row inside a thread. #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { dst[mi] = elt_[mi][0]; #pragma unroll for (int ni = 1; ni < MMAS_N * 4; ++ni) { dst[mi] = Functor::apply(dst[mi], elt_[mi][ni]); } } } // Apply the functor for each row inside each group of 4 threads. #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 1)); dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 2)); } // Store the different values. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { if (tidx_ % 4 == 0) { smem_write_[(mi * Mma_tile::M_PER_MMA_PER_CTA + 0) * ELEMENTS_PER_ROW] = dst[2 * mi + 0]; smem_write_[(mi * Mma_tile::M_PER_MMA_PER_CTA + 8) * ELEMENTS_PER_ROW] = dst[2 * mi + 1]; } } // Make sure the values are in shared memory. __syncthreads(); // Load 8 values (one for each warp). The /8 corresponds to /(4*2) where 4 is from the // float4. float4 tmp[2]; if (tidx_ < Cta_tile::M) { tmp[0] = reinterpret_cast(&smem_[0 * ELEMENTS / 2])[tidx_]; tmp[1] = reinterpret_cast(&smem_[1 * ELEMENTS / 2])[tidx_]; } // Compute the reduction of those 8 values in a binary-tree fashion. tmp[0].x = Functor::apply(tmp[0].x, tmp[0].y); tmp[0].z = Functor::apply(tmp[0].z, tmp[0].w); tmp[1].x = Functor::apply(tmp[1].x, tmp[1].y); tmp[1].z = Functor::apply(tmp[1].z, tmp[1].w); tmp[0].x = Functor::apply(tmp[0].x, tmp[0].z); tmp[1].x = Functor::apply(tmp[1].x, tmp[1].z); tmp[0].x = Functor::apply(tmp[0].x, tmp[1].x); // Return the result. return tmp[0].x; } // Do a CTA-wide reduction. template inline __device__ float reduce_() { // The result of the reduction. Threads 0..Cta_tile::M-1 own a single row value. float red = 0.f; // SEQLEN == 128. if (Cta_tile::WARPS_M == 2 && Cta_tile::WARPS_N == 2) { red = reduce_2x2(); // SEQLEN == 256. } else if (Cta_tile::WARPS_M == 1 && Cta_tile::WARPS_N == 4) { red = reduce_1x4(); // SEQLEN == 384. } else if (Cta_tile::WARPS_M == 1 && Cta_tile::WARPS_N == 8) { red = reduce_1x8(); // Not supported. } else { assert(false); } return red; } // Finalize the reduction. inline __device__ void shuffle(float (&dst)[MMAS_M * 2], float red) { // Store the value back to shared memory. if (tidx_ < Cta_tile::M) { smem_[tidx_] = red; } // Make sure the data is in shared memory. __syncthreads(); // Finally read the values. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { dst[2 * mi + 0] = smem_read_[mi * Mma_tile::M_PER_MMA_PER_CTA + 0]; dst[2 * mi + 1] = smem_read_[mi * Mma_tile::M_PER_MMA_PER_CTA + 8]; } // Make sure the data is in shared memory. __syncthreads(); } // Do a CTA-wide reduction. template inline __device__ void reduce(float (&dst)[MMAS_M * 2]) { // NOTE: 1 warp along reduce direction, no syncs if (Cta_tile::WARPS_N == 1) { reduce_Nx1(dst); } else { // The result of the reduction. Threads 0..Cta_tile::M-1 own a single row value. float red = reduce_(); // Make sure we can write to shared memory. __syncthreads(); // Finalize the reduction. shuffle(dst, red); } } // Scale all the elements. inline __device__ void scale(float const (&sum)[MMAS_M * 2]) { // Precompute the inverse sum to normalize. Without -use_fast_math, it makes a huge deal. float inv_sum[MMAS_M * 2]; #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { inv_sum[mi] = (sum[mi] == 0.f || sum[mi] != sum[mi]) ? 1.f : 1.f / sum[mi]; } // Update the values. #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N * 4; ++ni) { elt_[mi][ni] *= inv_sum[mi]; } } } // Shared memory for the CTA-wide reduction. float *smem_, *smem_write_, *smem_read_; // The current thread index. int tidx_; // The elements. float elt_[MMAS_M * 2][MMAS_N * 4]; }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax_hmma : public Softmax_base { // The base class. using Base = Softmax_base; // The MMAs. enum { MMAS_M = Base::MMAS_M }; enum { MMAS_N = Base::MMAS_N }; // Whether we need to skip the softmax due to the sliding-window attention // Otherwise, we will get NANs as those tokens are all masked out. enum { SLIDING_WINDOW_ATTENTION = Kernel_traits::SLIDING_WINDOW_ATTENTION }; // Use BMM1 softcapping scale or not. enum { ENABLE_BMM1_SOFTCAPPING_SCALE = Kernel_traits::ENABLE_BMM1_SOFTCAPPING_SCALE }; // The accumulators. using Accumulator = fmha::Fragment_accumulator; // Softmax dst data_type (BMM2 input) using Dst_type = typename Traits::A_type; // Ctor. template inline __device__ Softmax_hmma(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx), params_scale_bmm1_(params.scale_bmm1), params_softcapping_scale_bmm1_(params.softcapping_scale_bmm1) {} // Store the tile after softmax. template inline __device__ void store(Gmem_tile& gmem_tile) { Accumulator acc[MMAS_M][MMAS_N]; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // The elements. float tmp_00 = this->elt_[2 * mi + 0][4 * ni + 0]; float tmp_01 = this->elt_[2 * mi + 0][4 * ni + 1]; float tmp_02 = this->elt_[2 * mi + 0][4 * ni + 2]; float tmp_03 = this->elt_[2 * mi + 0][4 * ni + 3]; float tmp_10 = this->elt_[2 * mi + 1][4 * ni + 0]; float tmp_11 = this->elt_[2 * mi + 1][4 * ni + 1]; float tmp_12 = this->elt_[2 * mi + 1][4 * ni + 2]; float tmp_13 = this->elt_[2 * mi + 1][4 * ni + 3]; // Transform to accumulators. acc[mi][ni].reg(0) = fmha::float2_to_16bit_2(tmp_00, tmp_01); acc[mi][ni].reg(1) = fmha::float2_to_16bit_2(tmp_10, tmp_11); acc[mi][ni].reg(2) = fmha::float2_to_16bit_2(tmp_02, tmp_03); acc[mi][ni].reg(3) = fmha::float2_to_16bit_2(tmp_12, tmp_13); } } // Delegate to the gmem tile to store. gmem_tile.store(acc); } // Convert from FP16 fragments to floats. inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // Normalize the values, and clamp to finite half. uint32_t acc_0 = satfinite_h2(hmul2(acc[mi][ni].reg(0), params_scale_bmm1_)); uint32_t acc_1 = satfinite_h2(hmul2(acc[mi][ni].reg(1), params_scale_bmm1_)); uint32_t acc_2 = satfinite_h2(hmul2(acc[mi][ni].reg(2), params_scale_bmm1_)); uint32_t acc_3 = satfinite_h2(hmul2(acc[mi][ni].reg(3), params_scale_bmm1_)); // Extract the values as floats. half2_to_float2(this->elt_[2 * mi + 0][4 * ni + 0], this->elt_[2 * mi + 0][4 * ni + 1], acc_0); half2_to_float2(this->elt_[2 * mi + 1][4 * ni + 0], this->elt_[2 * mi + 1][4 * ni + 1], acc_1); half2_to_float2(this->elt_[2 * mi + 0][4 * ni + 2], this->elt_[2 * mi + 0][4 * ni + 3], acc_2); half2_to_float2(this->elt_[2 * mi + 1][4 * ni + 2], this->elt_[2 * mi + 1][4 * ni + 3], acc_3); // Attention logit softcapping scale. // 1.0f / softcapping_scale has been fused to scale_bmm1. if constexpr (ENABLE_BMM1_SOFTCAPPING_SCALE) { this->elt_[2 * mi + 0][4 * ni + 0] = params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 0][4 * ni + 0]); this->elt_[2 * mi + 0][4 * ni + 1] = params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 0][4 * ni + 1]); this->elt_[2 * mi + 1][4 * ni + 0] = params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 1][4 * ni + 0]); this->elt_[2 * mi + 1][4 * ni + 1] = params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 1][4 * ni + 1]); this->elt_[2 * mi + 0][4 * ni + 2] = params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 0][4 * ni + 2]); this->elt_[2 * mi + 0][4 * ni + 3] = params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 0][4 * ni + 3]); this->elt_[2 * mi + 1][4 * ni + 2] = params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 1][4 * ni + 2]); this->elt_[2 * mi + 1][4 * ni + 3] = params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 1][4 * ni + 3]); } } } } // Apply the exp to all the elements. // Need to make sure the results are zero when all elts are -FLT_MAX // as it is possible that all tokens are masked out. template inline __device__ void apply_exp_with_mask(float const (&max)[MMAS_M * 2]) { #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { float max_val = APPLY_MASK && max[mi] == -FLT_MAX ? 0.f : max[mi]; #pragma unroll for (int ni = 0; ni < MMAS_N * 4; ++ni) { this->elt_[mi][ni] = expf(this->elt_[mi][ni] - max_val); } } } // The scaling factor. uint32_t const params_scale_bmm1_; float const params_softcapping_scale_bmm1_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Fragment_helper {}; //////////////////////////////////////////////////////////////////////////////////////////////////// template <> struct Fragment_helper { // The traits. using Traits = fmha::Volta_imma_int8_int32_traits; // The fragment A. using Fragment_a = fmha::Fragment_a; // The accumulator. using Accumulator = fmha::Fragment_accumulator; // Load a 2x4 array from registers. static inline __device__ void load(int32_t (&dst)[2][4], Accumulator const& src) { dst[0][0] = src.elt(0); dst[0][1] = src.elt(1); dst[0][2] = src.elt(2); dst[0][3] = src.elt(3); dst[1][0] = src.elt(4); dst[1][1] = src.elt(5); dst[1][2] = src.elt(6); dst[1][3] = src.elt(7); } // Store to an accumulator. static inline __device__ void store(Accumulator& dst, uint32_t const (&src)[2][4]) { dst.reg(0) = src[0][0]; dst.reg(1) = src[0][1]; dst.reg(2) = src[0][2]; dst.reg(3) = src[0][3]; dst.reg(4) = src[1][0]; dst.reg(5) = src[1][1]; dst.reg(6) = src[1][2]; dst.reg(7) = src[1][3]; } }; //////////////////////////////////////////////////////////////////////////////////////////////////// template <> struct Fragment_helper { // The traits. using Traits = fmha::Turing_imma_int8_int32_traits; // The fragment A. using Fragment_a = fmha::Fragment_a; // The accumulator. using Accumulator = fmha::Fragment_accumulator; // Load a 2x4 array from registers. static inline __device__ void load(int32_t (&dst)[2][4], Accumulator const& src) { dst[0][0] = src.elt(0); dst[0][1] = src.elt(1); dst[0][2] = src.elt(2); dst[0][3] = src.elt(3); dst[1][0] = src.elt(4); dst[1][1] = src.elt(5); dst[1][2] = src.elt(6); dst[1][3] = src.elt(7); } // Store to an accumulator. static inline __device__ void store(Accumulator& dst, uint32_t const (&src)[2][4]) { dst.reg(0) = src[0][0]; dst.reg(1) = src[0][1]; dst.reg(2) = src[0][2]; dst.reg(3) = src[0][3]; dst.reg(4) = src[1][0]; dst.reg(5) = src[1][1]; dst.reg(6) = src[1][2]; dst.reg(7) = src[1][3]; } }; //////////////////////////////////////////////////////////////////////////////////////////////////// template <> struct Fragment_helper { // The traits. using Traits = fmha::Ampere_imma_int8_int32_traits; // The fragment A. using Fragment_a = fmha::Fragment_a; // The accumulator. using Accumulator = fmha::Fragment_accumulator; // Load a 2x4 array from registers. static inline __device__ void load(int32_t (&dst)[2][4], Accumulator const& src) { dst[0][0] = src.elt(0); dst[0][1] = src.elt(1); dst[0][2] = src.elt(4); dst[0][3] = src.elt(5); dst[1][0] = src.elt(2); dst[1][1] = src.elt(3); dst[1][2] = src.elt(6); dst[1][3] = src.elt(7); } // Store to an accumulator. static inline __device__ void store(Accumulator& dst, uint32_t const (&src)[2][4]) { dst.reg(0) = src[0][0]; dst.reg(1) = src[0][1]; dst.reg(4) = src[0][2]; dst.reg(5) = src[0][3]; dst.reg(2) = src[1][0]; dst.reg(3) = src[1][1]; dst.reg(6) = src[1][2]; dst.reg(7) = src[1][3]; } }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax_imma : public Softmax_base { // The base class. using Base = Softmax_base; // The Mma tile. using Mma_tile = typename Traits::template Mma_tile; // The MMAs. enum { MMAS_M = Base::MMAS_M }; enum { MMAS_N = Base::MMAS_N }; // The accumulators. using Accumulator = fmha::Fragment_accumulator; // The fragment. using Fragment_a = fmha::Fragment_a; // The dst type using Dst_type = typename Traits::A_type; // Ctor. template inline __device__ Softmax_imma(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx), params_scale_bmm1_(params.scale_bmm1), params_scale_softmax_(params.scale_softmax) {} // Store the tile after softmax. template inline __device__ void store(Gmem_tile& gmem_tile) { float const scale = reinterpret_cast(params_scale_softmax_); Accumulator acc[MMAS_M][MMAS_N]; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // Scale the FP32 elements. uint32_t tmp[2][4]; #pragma unroll for (int mj = 0; mj < 2; ++mj) { #pragma unroll for (int nj = 0; nj < 4; ++nj) { float f = this->elt_[2 * mi + mj][4 * ni + nj] * scale; asm volatile("cvt.rni.sat.s8.f32 %0, %1;\n" : "=r"(tmp[mj][nj]) : "f"(f)); } } // Convert to int8 and store. Fragment_helper::store(acc[mi][ni], tmp); } } // Delegate to the gmem tile to store. gmem_tile.store(acc); } // Convert from accumulators to floats. inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { float const scale = reinterpret_cast(params_scale_bmm1_); #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // Load the values from the accumulator's registers. int32_t tmp[2][4]; Fragment_helper::load(tmp, acc[mi][ni]); // Convert to FP32 and scale. #pragma unroll for (int mj = 0; mj < 2; ++mj) { #pragma unroll for (int nj = 0; nj < 4; ++nj) { #if defined(USE_I2F_EMULATION_TRICK) float f = reinterpret_cast(tmp[mj][nj]); this->elt_[2 * mi + mj][4 * ni + nj] = (f - FP32_I2F_MAGIC_NUMBER) * scale; #else this->elt_[2 * mi + mj][4 * ni + nj] = static_cast(tmp[mj][nj]) * scale; #endif // defined(USE_I2F_EMULATION_TRICK) } } } } } // Repack. We could use store/load to match the Smem_tile API. (shared by Ampere IMMA and Ada // QMMA) template inline __device__ void pack(Fragment_a_ (&dst)[K][M]) { // We pack N 16x16 acc tiles into K 16x32 tiles for A. // In the 16x16 tile, a thread owns 4 elts per row (4 regs). // In the 16x32 A tile, a thread owns 8 elts per row (2 regs). // Hence we have to pack with a 2:1 ratio. // For N = 1, K is 1: pack 4 values into dst reg 0. Set reg 1 to 0. // For N = 2, K is 1: pack 8 values into dst regs 0, 1. // For N = 3, K is 2: pack 12 values into dst regs (0,0), (0,1), (1,0). Set (1,1) to 0. // For N = 4, K is 2: pack 16 values into dst regs (0,0), (0,1), (1,0), (1,1) // For N = 5, K is 3: pack 20 values into dst regs (0,0), (0,1), (1,0), (1,1), (2,0). Set (2,1) // to 0. For N = 6, K is 3: pack 24 values into dst regs (0,0), (0,1), (1,0), (1,1), (2,0), // (2,1) static_assert(K == 3 || K == 2 || K == 1, ""); float const scale = reinterpret_cast(this->params_scale_softmax_); #pragma unroll for (int mi = 0; mi < M; ++mi) { // 1st row - 12 elements per row. float tmp_00 = this->elt_[2 * mi + 0][0] * scale; float tmp_01 = this->elt_[2 * mi + 0][1] * scale; float tmp_02 = this->elt_[2 * mi + 0][2] * scale; float tmp_03 = this->elt_[2 * mi + 0][3] * scale; float tmp_04 = this->elt_[2 * mi + 0][4] * scale; float tmp_05 = this->elt_[2 * mi + 0][5] * scale; float tmp_06 = this->elt_[2 * mi + 0][6] * scale; float tmp_07 = this->elt_[2 * mi + 0][7] * scale; float tmp_08 = this->elt_[2 * mi + 0][8] * scale; float tmp_09 = this->elt_[2 * mi + 0][9] * scale; float tmp_0a = this->elt_[2 * mi + 0][10] * scale; float tmp_0b = this->elt_[2 * mi + 0][11] * scale; // 2nd row - 12 elements per row. float tmp_20 = this->elt_[2 * mi + 1][0] * scale; float tmp_21 = this->elt_[2 * mi + 1][1] * scale; float tmp_22 = this->elt_[2 * mi + 1][2] * scale; float tmp_23 = this->elt_[2 * mi + 1][3] * scale; float tmp_24 = this->elt_[2 * mi + 1][4] * scale; float tmp_25 = this->elt_[2 * mi + 1][5] * scale; float tmp_26 = this->elt_[2 * mi + 1][6] * scale; float tmp_27 = this->elt_[2 * mi + 1][7] * scale; float tmp_28 = this->elt_[2 * mi + 1][8] * scale; float tmp_29 = this->elt_[2 * mi + 1][9] * scale; float tmp_2a = this->elt_[2 * mi + 1][10] * scale; float tmp_2b = this->elt_[2 * mi + 1][11] * scale; // Pack the first 12 elements to 6 registers of 2 fragments. dst[0][mi].reg(0) = fmha::float4_to_8bitx4(tmp_00, tmp_01, tmp_02, tmp_03); dst[0][mi].reg(1) = fmha::float4_to_8bitx4(tmp_20, tmp_21, tmp_22, tmp_23); dst[0][mi].reg(2) = fmha::float4_to_8bitx4(tmp_04, tmp_05, tmp_06, tmp_07); dst[0][mi].reg(3) = fmha::float4_to_8bitx4(tmp_24, tmp_25, tmp_26, tmp_27); if (K > 1) { dst[1][mi].reg(0) = fmha::float4_to_8bitx4(tmp_08, tmp_09, tmp_0a, tmp_0b); dst[1][mi].reg(1) = fmha::float4_to_8bitx4(tmp_28, tmp_29, tmp_2a, tmp_2b); } if (Mma_tile::MMAS_N == 6) { float tmp_0c = this->elt_[2 * mi + 0][12] * scale; float tmp_0d = this->elt_[2 * mi + 0][13] * scale; float tmp_0e = this->elt_[2 * mi + 0][14] * scale; float tmp_0f = this->elt_[2 * mi + 0][15] * scale; float tmp_10 = this->elt_[2 * mi + 0][16] * scale; float tmp_11 = this->elt_[2 * mi + 0][17] * scale; float tmp_12 = this->elt_[2 * mi + 0][18] * scale; float tmp_13 = this->elt_[2 * mi + 0][19] * scale; float tmp_14 = this->elt_[2 * mi + 0][20] * scale; float tmp_15 = this->elt_[2 * mi + 0][21] * scale; float tmp_16 = this->elt_[2 * mi + 0][22] * scale; float tmp_17 = this->elt_[2 * mi + 0][23] * scale; float tmp_2c = this->elt_[2 * mi + 1][12] * scale; float tmp_2d = this->elt_[2 * mi + 1][13] * scale; float tmp_2e = this->elt_[2 * mi + 1][14] * scale; float tmp_2f = this->elt_[2 * mi + 1][15] * scale; float tmp_30 = this->elt_[2 * mi + 1][16] * scale; float tmp_31 = this->elt_[2 * mi + 1][17] * scale; float tmp_32 = this->elt_[2 * mi + 1][18] * scale; float tmp_33 = this->elt_[2 * mi + 1][19] * scale; float tmp_34 = this->elt_[2 * mi + 1][20] * scale; float tmp_35 = this->elt_[2 * mi + 1][21] * scale; float tmp_36 = this->elt_[2 * mi + 1][22] * scale; float tmp_37 = this->elt_[2 * mi + 1][23] * scale; dst[1][mi].reg(2) = fmha::float4_to_8bitx4(tmp_0c, tmp_0d, tmp_0e, tmp_0f); dst[1][mi].reg(3) = fmha::float4_to_8bitx4(tmp_2c, tmp_2d, tmp_2e, tmp_2f); dst[2][mi].reg(0) = fmha::float4_to_8bitx4(tmp_10, tmp_11, tmp_12, tmp_13); dst[2][mi].reg(1) = fmha::float4_to_8bitx4(tmp_30, tmp_31, tmp_32, tmp_33); dst[2][mi].reg(2) = fmha::float4_to_8bitx4(tmp_14, tmp_15, tmp_16, tmp_17); dst[2][mi].reg(3) = fmha::float4_to_8bitx4(tmp_34, tmp_35, tmp_36, tmp_37); } else if (Mma_tile::MMAS_N == 4) { // SEQLEN == 128. float tmp_0c = this->elt_[2 * mi + 0][12] * scale; float tmp_0d = this->elt_[2 * mi + 0][13] * scale; float tmp_0e = this->elt_[2 * mi + 0][14] * scale; float tmp_0f = this->elt_[2 * mi + 0][15] * scale; float tmp_1c = this->elt_[2 * mi + 1][12] * scale; float tmp_1d = this->elt_[2 * mi + 1][13] * scale; float tmp_1e = this->elt_[2 * mi + 1][14] * scale; float tmp_1f = this->elt_[2 * mi + 1][15] * scale; dst[1][mi].reg(2) = fmha::float4_to_8bitx4(tmp_0c, tmp_0d, tmp_0e, tmp_0f); dst[1][mi].reg(3) = fmha::float4_to_8bitx4(tmp_1c, tmp_1d, tmp_1e, tmp_1f); // SEQLEN == 384 or SEQLEN == 256. } else if (Mma_tile::MMAS_N == 3 || Mma_tile::MMAS_N == 2) { // TODO added second OR term for ampere imma s=256: correct? dst[1][mi].reg(2) = 0u; dst[1][mi].reg(3) = 0u; } else if (Mma_tile::MMAS_N == 1) { dst[0][mi].reg(2) = 0u; dst[0][mi].reg(3) = 0u; // Not implemented. } else { assert(false); } } } // The scaling factors. uint32_t const params_scale_bmm1_, params_scale_softmax_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax_qmma : public Softmax_imma {}; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax_qmma : public Softmax_imma { // The Traits using Traits = fmha::Ada_qmma_e4m3_fp32_traits; // The base class. using Base = Softmax_imma; // The MMAs. enum { MMAS_M = Base::MMAS_M }; enum { MMAS_N = Base::MMAS_N }; // The accumulators. using Accumulator = fmha::Fragment_accumulator; // Ctor. template inline __device__ Softmax_qmma(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx), params_scale_bmm1_(params.scale_bmm1_d ? *params.scale_bmm1_d : params.scale_bmm1), params_scale_softmax_(params.scale_softmax) {} // Store the tile after softmax. template inline __device__ void store(Gmem_tile& gmem_tile) { float const scale = reinterpret_cast(params_scale_softmax_); Accumulator acc[MMAS_M][MMAS_N]; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // scale acc[mi][ni].ele(0) = this->elt_[2 * mi + 0][4 * ni + 0] * scale; acc[mi][ni].ele(1) = this->elt_[2 * mi + 0][4 * ni + 1] * scale; acc[mi][ni].ele(4) = this->elt_[2 * mi + 0][4 * ni + 2] * scale; acc[mi][ni].ele(5) = this->elt_[2 * mi + 0][4 * ni + 3] * scale; acc[mi][ni].ele(2) = this->elt_[2 * mi + 1][4 * ni + 0] * scale; acc[mi][ni].ele(3) = this->elt_[2 * mi + 1][4 * ni + 1] * scale; acc[mi][ni].ele(6) = this->elt_[2 * mi + 1][4 * ni + 2] * scale; acc[mi][ni].ele(7) = this->elt_[2 * mi + 1][4 * ni + 3] * scale; } } // Delegate to the gmem tile to store. // TODO: need fp32 to fp8 conversion (move this to gmem_tile) gmem_tile.store(acc); } // Convert from accumulators to floats. inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { float const scale = reinterpret_cast(params_scale_bmm1_); #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // Convert to FP32 and scale. this->elt_[2 * mi + 0][4 * ni + 0] = acc[mi][ni].elt(0) * scale; this->elt_[2 * mi + 0][4 * ni + 1] = acc[mi][ni].elt(1) * scale; this->elt_[2 * mi + 0][4 * ni + 2] = acc[mi][ni].elt(4) * scale; this->elt_[2 * mi + 0][4 * ni + 3] = acc[mi][ni].elt(5) * scale; this->elt_[2 * mi + 1][4 * ni + 0] = acc[mi][ni].elt(2) * scale; this->elt_[2 * mi + 1][4 * ni + 1] = acc[mi][ni].elt(3) * scale; this->elt_[2 * mi + 1][4 * ni + 2] = acc[mi][ni].elt(6) * scale; this->elt_[2 * mi + 1][4 * ni + 3] = acc[mi][ni].elt(7) * scale; } } } template inline __device__ void apply_exp_with_mask(float const (&max)[MMAS_M * 2]) { #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { float max_val = APPLY_MASK && max[mi] == -FLT_MAX ? 0.f : (max[mi] - logf(Traits::SOFTMAX_FP_QUANT_SCALE)); #pragma unroll for (int ni = 0; ni < MMAS_N * 4; ++ni) { this->elt_[mi][ni] = expf(this->elt_[mi][ni] - max_val); } } } // Pack the data to a fragment for the next GEMM. template inline __device__ void pack(Fragment_a (&dst)[K][M]) const { float const scale = reinterpret_cast(this->params_scale_softmax_); // The canonical layout in K should be R0: [0,1,2,3] R2: [16,17,18,19] // Note below that this is not possible with the register layout of the accumulator. #pragma unroll for (int mi = 0; mi < M; ++mi) { #pragma unroll for (int ki = 0; ki < K; ++ki) { // 1st row - 8 elements per row. float tmp_00 = this->elt_[2 * mi + 0][8 * ki + 0] * scale; // + 0 float tmp_01 = this->elt_[2 * mi + 0][8 * ki + 1] * scale; // + 1 float tmp_02 = this->elt_[2 * mi + 0][8 * ki + 2] * scale; // + 8 float tmp_03 = this->elt_[2 * mi + 0][8 * ki + 3] * scale; // + 9 float tmp_04 = this->elt_[2 * mi + 0][8 * ki + 4] * scale; // +16 float tmp_05 = this->elt_[2 * mi + 0][8 * ki + 5] * scale; // +17 float tmp_06 = this->elt_[2 * mi + 0][8 * ki + 6] * scale; // +24 float tmp_07 = this->elt_[2 * mi + 0][8 * ki + 7] * scale; // +25 // 2nd row - 4 elements per row. float tmp_10 = this->elt_[2 * mi + 1][8 * ki + 0] * scale; // + 0 float tmp_11 = this->elt_[2 * mi + 1][8 * ki + 1] * scale; // + 1 float tmp_12 = this->elt_[2 * mi + 1][8 * ki + 2] * scale; // + 8 float tmp_13 = this->elt_[2 * mi + 1][8 * ki + 3] * scale; // + 9 float tmp_14 = this->elt_[2 * mi + 1][8 * ki + 4] * scale; // +16 float tmp_15 = this->elt_[2 * mi + 1][8 * ki + 5] * scale; // +17 float tmp_16 = this->elt_[2 * mi + 1][8 * ki + 6] * scale; // +24 float tmp_17 = this->elt_[2 * mi + 1][8 * ki + 7] * scale; // +25 // Pack to 4 registers. dst[ki][mi].reg(0) = fmha::float4_to_fp8x4(tmp_00, tmp_01, tmp_02, tmp_03); dst[ki][mi].reg(1) = fmha::float4_to_fp8x4(tmp_10, tmp_11, tmp_12, tmp_13); dst[ki][mi].reg(2) = fmha::float4_to_fp8x4(tmp_04, tmp_05, tmp_06, tmp_07); dst[ki][mi].reg(3) = fmha::float4_to_fp8x4(tmp_14, tmp_15, tmp_16, tmp_17); } } } // The scaling factors. uint32_t const params_scale_bmm1_, params_scale_softmax_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax_qmma : public Softmax_imma { // The Traits using Traits = fmha::Ada_qmma_e4m3_fp16_traits; // The base class. using Base = Softmax_imma; // The MMAs. enum { MMAS_M = Base::MMAS_M }; enum { MMAS_N = Base::MMAS_N }; // The accumulators. using Accumulator = fmha::Fragment_accumulator; // Ctor. template inline __device__ Softmax_qmma(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx), params_scale_bmm1_(params.scale_bmm1), params_scale_softmax_(params.scale_softmax) {} // Store the tile after softmax. template inline __device__ void store(Gmem_tile& gmem_tile) { float const scale = reinterpret_cast(params_scale_softmax_); Accumulator acc[MMAS_M][MMAS_N]; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // scale acc[mi][ni].ele(0) = this->elt_[2 * mi + 0][4 * ni + 0] * scale; acc[mi][ni].ele(1) = this->elt_[2 * mi + 0][4 * ni + 1] * scale; acc[mi][ni].ele(4) = this->elt_[2 * mi + 0][4 * ni + 2] * scale; acc[mi][ni].ele(5) = this->elt_[2 * mi + 0][4 * ni + 3] * scale; acc[mi][ni].ele(2) = this->elt_[2 * mi + 1][4 * ni + 0] * scale; acc[mi][ni].ele(3) = this->elt_[2 * mi + 1][4 * ni + 1] * scale; acc[mi][ni].ele(6) = this->elt_[2 * mi + 1][4 * ni + 2] * scale; acc[mi][ni].ele(7) = this->elt_[2 * mi + 1][4 * ni + 3] * scale; } } // Delegate to the gmem tile to store. // TODO: need fp32 to fp8 conversion (move this to gmem_tile) gmem_tile.store(acc); } // Convert from accumulators to floats. inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // Convert to FP32 and scale. float2* elt_ptr0 = reinterpret_cast(this->elt_[2 * mi + 0] + 4 * ni); float2* elt_ptr1 = reinterpret_cast(this->elt_[2 * mi + 1] + 4 * ni); elt_ptr0[0] = fmha::half2_to_float2(fmha::hmul2(acc[mi][ni].reg(0), params_scale_bmm1_)); elt_ptr0[1] = fmha::half2_to_float2(fmha::hmul2(acc[mi][ni].reg(2), params_scale_bmm1_)); elt_ptr1[0] = fmha::half2_to_float2(fmha::hmul2(acc[mi][ni].reg(1), params_scale_bmm1_)); elt_ptr1[1] = fmha::half2_to_float2(fmha::hmul2(acc[mi][ni].reg(3), params_scale_bmm1_)); } } } // The scaling factors. uint32_t const params_scale_bmm1_, params_scale_softmax_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax {}; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax { // The traits class. using Traits = fmha::Volta_hmma_fp16_traits; // The Mma tile. using Mma_tile = typename Traits::template Mma_tile; // The accumulators. using Accumulator = fmha::Fragment_accumulator; // The fragment. using Fragment_a = fmha::Fragment_a; // Softmax dst data_type (BMM2 input) using Dst_type = typename Traits::A_type; // The number of MMAs in M/N dimensions. enum { MMAS_M = Mma_tile::MMAS_M }; enum { MMAS_N = Mma_tile::MMAS_N }; // The number of groups of warp such that we have at most 2 warps writing consecutive elements. enum { GROUPS = fmha::Div_up::VALUE }; // The number of elements that we are going to store per row. enum { ELEMENTS_PER_ROW = Cta_tile::WARPS_N / GROUPS }; // The number of rows. enum { ROWS = Cta_tile::M * GROUPS }; // The total number of elements. enum { ELEMENTS = ROWS * ELEMENTS_PER_ROW }; // Use BMM1 softcapping scale or not. enum { ENABLE_BMM1_SOFTCAPPING_SCALE = Kernel_traits::ENABLE_BMM1_SOFTCAPPING_SCALE }; // If shared memory is used enum { USE_SHARED_MEMORY = Cta_tile::WARPS_N > 1 }; // The number of rows per thread. enum { ROWS_PER_THREAD = MMAS_M }; // DEBUG. static_assert(ELEMENTS == Cta_tile::M * Cta_tile::WARPS_N, ""); // END OF DEBUG. // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : params_scale_bmm1_(params.scale_bmm1), params_softcapping_scale_bmm1_(params.softcapping_scale_bmm1), smem_(reinterpret_cast(smem)), tidx_(tidx) { // Extract the position in the warp. int warp = tidx / Cta_tile::THREADS_PER_WARP; int lane = tidx % Cta_tile::THREADS_PER_WARP; // Decompose the warp index into M and N. int warp_m = warp % Cta_tile::WARPS_M; int warp_n = warp / Cta_tile::WARPS_M; // Decompose the warp-n index into group/position-inside-the-group. int warp_g = warp_n / ELEMENTS_PER_ROW; int warp_i = warp_n % ELEMENTS_PER_ROW; // The row written/read by the thread (threads i and i+8 are on the same row). int row = (lane & 0x10) / 2 + (lane & 0x07); // The location written by the threads. int write_row = warp_g * Cta_tile::M + warp_m * Mma_tile::M_PER_MMA + row; int write_col = warp_i; // Assemble the write pointer. smem_write_ = &smem_[write_row * ELEMENTS_PER_ROW + write_col]; // Assemble the read pointer. smem_read_ = &smem_[warp_m * Mma_tile::M_PER_MMA + row]; } // Apply mask before softmax. Use 1 byte per MMA distributed as 1x8. template inline __device__ void apply_mask(Mask const& mask) { #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { #pragma unroll for (int ii = 0; ii < 8; ++ii) { if (!mask.is_valid(mi, ni, 0, ii)) { elt_[mi][8 * ni + ii] = -FLT_MAX; } } } } } template inline __device__ void apply_mask_alibi(Mask const& mask, int head_id, AlibiParams const& alibi_params) { // 'if constexpr' because ALiBi is only defined for causal masks if constexpr (Kernel_traits::CAUSAL_MASK) { float m = get_alibi_head_scaling_factor(head_id, alibi_params); #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { #pragma unroll for (int ii = 0; ii < 8; ++ii) { int row, col; mask.get_row_col(row, col, mi, ni, 0, ii); if (mask.is_valid(row, col)) { // Since softmax is shift invariant, // it is sufficient just to use the column as the multiplier elt_[mi][8 * ni + ii] = elt_[mi][8 * ni + ii] * alibi_params.scale_after_alibi + m * (col + alibi_params.sequence_pos_offset); } else { elt_[mi][8 * ni + ii] = -FLT_MAX; } } } } } else { __builtin_unreachable(); } } // Apply the mask to unpacked data. inline __device__ void apply_mask(uint32_t const (&packed_mask)[MMAS_M]) { // This code works only if we have MMAS_N <= 4. static_assert(MMAS_N <= 4, ""); // Expand the mask. int mask[MMAS_M][MMAS_N * 8]; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ii = 0; ii < MMAS_N * 8; ++ii) { mask[mi][ii] = packed_mask[mi] & (1u << ii); } } // Apply the mask. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N * 8; ++ni) { if (!mask[mi][ni]) { elt_[mi][ni] = -FLT_MAX; } } } } // Mask the elements that are outside the the sequence length. inline __device__ void apply_mask(int const seqlen) { // The warp/lane decomposition. int const warp = threadIdx.x / Cta_tile::THREADS_PER_WARP; int const lane = threadIdx.x % Cta_tile::THREADS_PER_WARP; // The warp in the n dimension. int const warp_n = warp / Cta_tile::WARPS_M; // The base position within a quad. int const offset = warp_n * 16 + (threadIdx.x & 0x08) / 2; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // The position in the sequence. int pos = offset + ni * Mma_tile::N_PER_MMA_PER_CTA; // Determine the position in the sequence. if (pos + 0 >= seqlen) { elt_[mi][8 * ni + 0] = -FLT_MAX; } if (pos + 1 >= seqlen) { elt_[mi][8 * ni + 1] = -FLT_MAX; } if (pos + 2 >= seqlen) { elt_[mi][8 * ni + 2] = -FLT_MAX; } if (pos + 3 >= seqlen) { elt_[mi][8 * ni + 3] = -FLT_MAX; } if (pos + 8 >= seqlen) { elt_[mi][8 * ni + 4] = -FLT_MAX; } if (pos + 9 >= seqlen) { elt_[mi][8 * ni + 5] = -FLT_MAX; } if (pos + 10 >= seqlen) { elt_[mi][8 * ni + 6] = -FLT_MAX; } if (pos + 11 >= seqlen) { elt_[mi][8 * ni + 7] = -FLT_MAX; } } } } // Apply the exp to all the elements. // Need to make sure the results are zero when all elts are -FLT_MAX // as it is possible that all tokens are masked out. template inline __device__ void apply_exp_with_mask(float const (&max)[MMAS_M]) { #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { float max_val = APPLY_MASK && max[mi] == -FLT_MAX ? 0.f : max[mi]; #pragma unroll for (int ni = 0; ni < MMAS_N * 8; ++ni) { this->elt_[mi][ni] = expf(this->elt_[mi][ni] - max_val); } } } // Apply the exp to all the elements. inline __device__ void apply_exp(float const max) { #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N * 8; ++ni) { elt_[mi][ni] = apply_exp_(elt_[mi][ni], max); } } } // Apply the exp to all the elements. inline __device__ void apply_exp(float const (&max)[MMAS_M]) { #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N * 8; ++ni) { elt_[mi][ni] = apply_exp_(elt_[mi][ni], max[mi]); } } } // Pack the data to a fragment for the next GEMM. template inline __device__ void pack(Fragment_a (&dst)[K][M]) const { static_assert(MMAS_M == M && MMAS_N == K, ""); #pragma unroll for (int mi = 0; mi < M; ++mi) { #pragma unroll for (int ki = 0; ki < K; ++ki) { // 8 elements per row. float tmp_0 = this->elt_[mi][8 * ki + 0]; float tmp_1 = this->elt_[mi][8 * ki + 1]; float tmp_2 = this->elt_[mi][8 * ki + 2]; float tmp_3 = this->elt_[mi][8 * ki + 3]; float tmp_4 = this->elt_[mi][8 * ki + 4]; float tmp_5 = this->elt_[mi][8 * ki + 5]; float tmp_6 = this->elt_[mi][8 * ki + 6]; float tmp_7 = this->elt_[mi][8 * ki + 7]; // Pack to 4 registers. dst[ki][mi].reg(0) = fmha::float2_to_16bit_2(tmp_0, tmp_1); dst[ki][mi].reg(1) = fmha::float2_to_16bit_2(tmp_2, tmp_3); dst[ki][mi].reg(2) = fmha::float2_to_16bit_2(tmp_4, tmp_5); dst[ki][mi].reg(3) = fmha::float2_to_16bit_2(tmp_6, tmp_7); } } } // Do a CTA-wide reduction. template inline __device__ void reduce_Nx1(float (&dst)[MMAS_M]) { #if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) if (Functor::IS_SUM) { // Apply the summation inside the thread for each row. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { // The thread local math in the reference code. float sums[MMAS_N * 2]; #pragma unroll for (int ii = 0; ii < MMAS_N * 2; ++ii) { sums[ii] = elt_[mi][4 * ii + 0]; sums[ii] += elt_[mi][4 * ii + 1]; sums[ii] += elt_[mi][4 * ii + 2]; sums[ii] += elt_[mi][4 * ii + 3]; } // Columns 0 and 8: __shfl( 2). #pragma unroll for (int ii = 0; ii < MMAS_N; ++ii) { sums[2 * ii] += sums[2 * ii + 1]; } // Columns 0 and 32: __shfl( 8). #pragma unroll for (int ii = 0; ii < MMAS_N / 2; ++ii) { // MMAS_N / 2 == 0 if MMAS_N <= 1. sums[4 * ii] += sums[4 * ii + 2]; } // Columns 0 and 64: __shfl(16). if (MMAS_N == 3) { sums[0] += sums[4]; } else if (MMAS_N >= 4) { #pragma unroll for (int ii = 0; ii < MMAS_N / 4; ++ii) { // MMAS_N / 4 == 0 if MMAS_N <= 2. sums[8 * ii] += sums[8 * ii + 4]; } } // Store the final value for that row. dst[mi] = sums[0]; } } else #endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) { // Apply the functor for each row inside a thread. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { dst[mi] = elt_[mi][0]; #pragma unroll for (int ni = 1; ni < MMAS_N * 8; ++ni) { dst[mi] = Functor::apply(dst[mi], elt_[mi][ni]); } } } // Apply the functor for each row. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 8)); } } // Do a CTA-wide reduction. template inline __device__ float reduce_2x2() { float dst[MMAS_M]; #if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) if (Functor::IS_SUM) { // Apply the summation inside the thread for each row. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { // The thread local math in the reference code. float sums[MMAS_N * 2]; #pragma unroll for (int ii = 0; ii < MMAS_N * 2; ++ii) { sums[ii] = elt_[mi][4 * ii + 0]; sums[ii] += elt_[mi][4 * ii + 1]; sums[ii] += elt_[mi][4 * ii + 2]; sums[ii] += elt_[mi][4 * ii + 3]; } // Columns 0 and 8: __shfl( 2). #pragma unroll for (int ii = 0; ii < MMAS_N; ++ii) { sums[2 * ii] += sums[2 * ii + 1]; } // Columns 0 and 32: __shfl( 8). #pragma unroll for (int ii = 0; ii < MMAS_N / 2; ++ii) { // MMAS_N / 2 == 0 if MMAS_N <= 1. sums[4 * ii] += sums[4 * ii + 2]; } // Columns 0 and 64: __shfl(16). if (MMAS_N == 3) { sums[0] += sums[4]; } else if (MMAS_N >= 4) { #pragma unroll for (int ii = 0; ii < MMAS_N / 4; ++ii) { // MMAS_N / 4 == 0 if MMAS_N <= 2. sums[8 * ii] += sums[8 * ii + 4]; } } // Store the final value for that row. dst[mi] = sums[0]; } } else #endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) { // Apply the functor for each row inside a thread. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { dst[mi] = elt_[mi][0]; #pragma unroll for (int ni = 1; ni < MMAS_N * 8; ++ni) { dst[mi] = Functor::apply(dst[mi], elt_[mi][ni]); } } } // Apply the functor for each row. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 8)); } // Store the different values to shared memory. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { if (tidx_ % 16 < 8) { smem_write_[mi * Mma_tile::M_PER_MMA_PER_CTA * ELEMENTS_PER_ROW] = dst[mi]; } } // Make sure the values are in shared memory. __syncthreads(); // Load 2 values (one for each warp). float2 tmp = reinterpret_cast(smem_)[tidx_]; // Compute the reduction of those 2 values in a binary-tree fashion. return Functor::apply(tmp.x, tmp.y); } // Do a CTA-wide reduction. template inline __device__ float reduce_1x4() { float dst[MMAS_M]; #if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) if (Functor::IS_SUM) { // Apply the summation inside the thread for each row. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { // The thread local math in the reference code. float sums[MMAS_N * 2]; #pragma unroll for (int ii = 0; ii < MMAS_N * 2; ++ii) { sums[ii] = elt_[mi][4 * ii + 0]; sums[ii] += elt_[mi][4 * ii + 1]; sums[ii] += elt_[mi][4 * ii + 2]; sums[ii] += elt_[mi][4 * ii + 3]; } // Columns 0 and 128 (the ref code uses a step of 128). Not needed if SEQLEN <= 128. if (Cta_tile::N > 128) { #pragma unroll for (int ii = 0; ii < MMAS_N; ++ii) { sums[ii] += sums[MMAS_N + ii]; } } // Columns 0 and 8: __shfl( 2). #pragma unroll for (int ii = 0; ii < MMAS_N; ++ii) { sums[2 * ii] += sums[2 * ii + 1]; } // Columns 0 and 64: __shfl(16). #pragma unroll for (int ii = 0; ii < MMAS_N / 2; ++ii) { // MMAS_N / 2 == 0 if MMAS_N <= 1. sums[4 * ii] += sums[4 * ii + 2]; } // Store the final value for that row. dst[mi] = sums[0]; } } else #endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) { // Apply the functor for each row inside a thread. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { dst[mi] = elt_[mi][0]; #pragma unroll for (int ni = 1; ni < MMAS_N * 8; ++ni) { dst[mi] = Functor::apply(dst[mi], elt_[mi][ni]); } } } // Apply the functor for each row. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 8)); } // Store the different values to shared memory. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { if (tidx_ % 16 < 8) { smem_write_[mi * Mma_tile::M_PER_MMA_PER_CTA * ELEMENTS_PER_ROW] = dst[mi]; } } // Make sure the values are in shared memory. __syncthreads(); // Load 4 values (one for each warp). float2 tmp[2]; if (tidx_ < Cta_tile::M) { tmp[0] = reinterpret_cast(&smem_[0 * ELEMENTS / 2])[tidx_]; tmp[1] = reinterpret_cast(&smem_[1 * ELEMENTS / 2])[tidx_]; } // Compute the reduction of those 4 values in a binary-tree fashion. tmp[0].x = Functor::apply(tmp[0].x, tmp[0].y); tmp[1].x = Functor::apply(tmp[1].x, tmp[1].y); tmp[0].x = Functor::apply(tmp[0].x, tmp[1].x); // Return the final reduction. return tmp[0].x; } // Do a CTA-wide reduction. template inline __device__ float reduce_1x8() { float dst[MMAS_M]; #if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) if (Functor::IS_SUM) { // Apply the summation inside the thread for each row. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { // The thread local math in the reference code. float sums[MMAS_N * 2]; #pragma unroll for (int ii = 0; ii < MMAS_N * 2; ++ii) { sums[ii] = elt_[mi][4 * ii + 0]; sums[ii] += elt_[mi][4 * ii + 1]; sums[ii] += elt_[mi][4 * ii + 2]; sums[ii] += elt_[mi][4 * ii + 3]; } // Columns 0 and 128 (the ref code uses a step of 128). Not needed if SEQLEN <= 128. #pragma unroll for (int ii = 1; ii < MMAS_N; ++ii) { sums[0] += sums[2 * ii + 0]; sums[1] += sums[2 * ii + 1]; } // Columns 0 and 8: __shfl( 2). dst[mi] = sums[0] + sums[1]; } } else #endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) { // Apply the functor for each row inside a thread. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { dst[mi] = elt_[mi][0]; #pragma unroll for (int ni = 1; ni < MMAS_N * 8; ++ni) { dst[mi] = Functor::apply(dst[mi], elt_[mi][ni]); } } } // Apply the functor for each row. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 8)); } // Store the different values to shared memory. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { if (tidx_ % 16 < 8) { smem_write_[mi * Mma_tile::M_PER_MMA_PER_CTA * ELEMENTS_PER_ROW] = dst[mi]; } } // Make sure the values are in shared memory. __syncthreads(); // Load 8 values (one for each warp). float2 tmp[4]; if (tidx_ < Cta_tile::M) { tmp[0] = reinterpret_cast(&smem_[0 * ELEMENTS / 4])[tidx_]; tmp[1] = reinterpret_cast(&smem_[1 * ELEMENTS / 4])[tidx_]; tmp[2] = reinterpret_cast(&smem_[2 * ELEMENTS / 4])[tidx_]; tmp[3] = reinterpret_cast(&smem_[3 * ELEMENTS / 4])[tidx_]; } // // DEBUG. // if( tidx_ == 0 ) { // #pragma unroll // for( int ii = 0; ii < 4; ++ii ) { // printf("tidx=%3d tmp[%d]=%8.3f %8.3f\n", tidx_, ii, tmp[ii].x, tmp[ii].y); // } // } // // END OF DEBUG. // Compute the reduction of those 8 values in a binary-tree fashion. tmp[0].x = Functor::apply(tmp[0].x, tmp[0].y); tmp[1].x = Functor::apply(tmp[1].x, tmp[1].y); tmp[2].x = Functor::apply(tmp[2].x, tmp[2].y); tmp[3].x = Functor::apply(tmp[3].x, tmp[3].y); tmp[0].x = Functor::apply(tmp[0].x, tmp[1].x); tmp[2].x = Functor::apply(tmp[2].x, tmp[3].x); tmp[0].x = Functor::apply(tmp[0].x, tmp[2].x); // Return the final reduction. return tmp[0].x; } // Do a CTA-wide reduction. template inline __device__ float reduce_() { // The final reduction. float red = 0.f; // SEQLEN == 128. if (Cta_tile::WARPS_M == 2 && Cta_tile::WARPS_N == 2) { red = reduce_2x2(); // SEQLEN == 256. } else if (Cta_tile::WARPS_M == 1 && Cta_tile::WARPS_N == 4) { red = reduce_1x4(); // SEQLEN == 256. } else if (Cta_tile::WARPS_M == 1 && Cta_tile::WARPS_N == 8) { red = reduce_1x8(); // Not supported. } else { assert(false); } return red; } // Finalize the reduction. inline __device__ void shuffle(float (&dst)[MMAS_M], float red) { // Store the value back to shared memory. if (tidx_ < Cta_tile::M) { smem_[tidx_] = red; } // Make sure the data is in shared memory. __syncthreads(); // Finally read the values. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { dst[mi] = smem_read_[mi * Mma_tile::M_PER_MMA_PER_CTA]; } // Make sure we are done reading shared memory. __syncthreads(); } // Do a CTA-wide reduction. template inline __device__ void reduce(float (&dst)[MMAS_M]) { // NOTE: 1 warp along reduce direction, no syncs if (Cta_tile::WARPS_N == 1) { reduce_Nx1(dst); } else { // The result of the reduction. Threads 0..Cta_tile::M-1 own a valid value. float red = reduce_(); // Make sure we can write to shared memory. __syncthreads(); // Finalize the reduction. shuffle(dst, red); } } // Scale all the elements. inline __device__ void scale(float const (&sum)[MMAS_M]) { // Precompute the inverse sum to normalize. Without -use_fast_math, it makes a huge deal. float inv_sum[MMAS_M]; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { inv_sum[mi] = (sum[mi] == 0.f || sum[mi] != sum[mi]) ? 1.f : 1.f / sum[mi]; } // Update the values. #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N * 8; ++ni) { elt_[mi][ni] *= inv_sum[mi]; } } } // Store the tile after softmax. template inline __device__ void store(Gmem_tile& gmem_tile) { Accumulator acc[MMAS_M][MMAS_N]; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // The elements. float tmp_00 = this->elt_[mi][8 * ni + 0]; float tmp_01 = this->elt_[mi][8 * ni + 1]; float tmp_02 = this->elt_[mi][8 * ni + 2]; float tmp_03 = this->elt_[mi][8 * ni + 3]; float tmp_04 = this->elt_[mi][8 * ni + 4]; float tmp_05 = this->elt_[mi][8 * ni + 5]; float tmp_06 = this->elt_[mi][8 * ni + 6]; float tmp_07 = this->elt_[mi][8 * ni + 7]; // Transform to accumulators. acc[mi][ni].reg(0) = fmha::float2_to_16bit_2(tmp_00, tmp_01); acc[mi][ni].reg(1) = fmha::float2_to_16bit_2(tmp_02, tmp_03); acc[mi][ni].reg(2) = fmha::float2_to_16bit_2(tmp_04, tmp_05); acc[mi][ni].reg(3) = fmha::float2_to_16bit_2(tmp_06, tmp_07); } } // Delegate to the gmem tile to store. gmem_tile.store(acc); } // Convert from FP16 fragments to floats. inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // Normalize the values, and clamp to finite half. uint32_t acc_0 = satfinite_h2(hmul2(acc[mi][ni].reg(0), params_scale_bmm1_)); uint32_t acc_1 = satfinite_h2(hmul2(acc[mi][ni].reg(1), params_scale_bmm1_)); uint32_t acc_2 = satfinite_h2(hmul2(acc[mi][ni].reg(2), params_scale_bmm1_)); uint32_t acc_3 = satfinite_h2(hmul2(acc[mi][ni].reg(3), params_scale_bmm1_)); // Extract the values as floats. half2_to_float2(this->elt_[mi][8 * ni + 0], this->elt_[mi][8 * ni + 1], acc_0); half2_to_float2(this->elt_[mi][8 * ni + 2], this->elt_[mi][8 * ni + 3], acc_1); half2_to_float2(this->elt_[mi][8 * ni + 4], this->elt_[mi][8 * ni + 5], acc_2); half2_to_float2(this->elt_[mi][8 * ni + 6], this->elt_[mi][8 * ni + 7], acc_3); if constexpr (ENABLE_BMM1_SOFTCAPPING_SCALE) { #pragma unroll for (int i = 0; i < 8; i++) { // 1.0f / softcapping_scale has been fused to scale_bmm1. this->elt_[mi][8 * ni + i] = params_softcapping_scale_bmm1_ * __tanhf(this->elt_[mi][8 * ni + i]); } } } } } // The scaling factor. uint32_t const params_scale_bmm1_; float const params_softcapping_scale_bmm1_; // Shared memory for the CTA-wide reduction. float *smem_, *smem_write_, *smem_read_; // The current thread index. int tidx_; // The elements. float elt_[MMAS_M][MMAS_N * 8]; }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax : public Softmax_hmma { // The traits. using Traits = fmha::Turing_hmma_fp16_traits; // The base class. using Base = Softmax_hmma; // The fragment. using Fragment_a = fmha::Fragment_a; // Softmax dst data_type (BMM2 input) using Dst_type = typename Traits::A_type; // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx) {} // Pack the data to a fragment for the next GEMM. template inline __device__ void pack(Fragment_a (&dst)[K][M]) const { static_assert(Base::Mma_tile::MMAS_M == M && Base::Mma_tile::MMAS_N * 4 == K * 2, ""); #pragma unroll for (int mi = 0; mi < M; ++mi) { #pragma unroll for (int ki = 0; ki < K; ++ki) { // 1st row - 2 elements per row. float tmp_00 = this->elt_[2 * mi + 0][2 * ki + 0]; float tmp_01 = this->elt_[2 * mi + 0][2 * ki + 1]; // 2nd row - 2 elements per row. float tmp_10 = this->elt_[2 * mi + 1][2 * ki + 0]; float tmp_11 = this->elt_[2 * mi + 1][2 * ki + 1]; // Pack to 2 registers. dst[ki][mi].reg(0) = fmha::float2_to_16bit_2(tmp_00, tmp_01); dst[ki][mi].reg(1) = fmha::float2_to_16bit_2(tmp_10, tmp_11); } } } }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax : public Softmax_imma { // The traits. using Traits = fmha::Volta_imma_int8_int32_traits; // The base class. using Base = Softmax_imma; // The fragment. using Fragment_a = fmha::Fragment_a; // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx) {} // Repack. We could use store/load to match the Smem_tile API. template inline __device__ void pack(Fragment_a (&dst)[K][M]) { static_assert(Base::Mma_tile::MMAS_M == M && Base::Mma_tile::MMAS_N == K, ""); float const scale = reinterpret_cast(this->params_scale_softmax_); #pragma unroll for (int mi = 0; mi < M; ++mi) { #pragma unroll for (int ki = 0; ki < K; ++ki) { // 1st row - 4 elements per row. float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0] * scale; float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1] * scale; float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2] * scale; float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3] * scale; // 2nd row - 4 elements per row. float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0] * scale; float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1] * scale; float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2] * scale; float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3] * scale; // Pack to 2 registers. dst[ki][mi].reg(0) = float4_to_char4(tmp_00, tmp_01, tmp_02, tmp_03); dst[ki][mi].reg(1) = float4_to_char4(tmp_10, tmp_11, tmp_12, tmp_13); } } } }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax : public Softmax_imma { // The traits. using Traits = fmha::Turing_imma_int8_int32_traits; // The base class. using Base = Softmax_imma; // The fragment. using Fragment_a = fmha::Fragment_a; // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx) {} // Repack. We could use store/load to match the Smem_tile API. template inline __device__ void pack(Fragment_a (&dst)[K][M]) { static_assert(Base::Mma_tile::MMAS_M == M && Base::Mma_tile::MMAS_N == K, ""); float const scale = reinterpret_cast(this->params_scale_softmax_); #pragma unroll for (int mi = 0; mi < M; ++mi) { #pragma unroll for (int ki = 0; ki < K; ++ki) { // 1st row - 4 elements per row. float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0] * scale; float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1] * scale; float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2] * scale; float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3] * scale; // 2nd row - 4 elements per row. float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0] * scale; float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1] * scale; float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2] * scale; float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3] * scale; // Pack to 2 registers. dst[ki][mi].reg(0) = float4_to_char4(tmp_00, tmp_01, tmp_02, tmp_03); dst[ki][mi].reg(1) = float4_to_char4(tmp_10, tmp_11, tmp_12, tmp_13); } } } }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax : public Softmax_hmma { // The traits. using Traits = fmha::Ampere_hmma_fp16_traits; // The base class. using Base = Softmax_hmma; // The fragment. using Fragment_a = fmha::Fragment_a; // Softmax dst data_type (BMM2 input) using Dst_type = typename Traits::A_type; // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx) {} // Pack the data to a fragment for the next GEMM. template inline __device__ void pack(Fragment_a (&dst)[K][M]) const { #pragma unroll for (int mi = 0; mi < M; ++mi) { #pragma unroll for (int ki = 0; ki < K; ++ki) { // 1st row - 4 elements per row. float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0]; float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1]; float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2]; float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3]; // 2nd row - 4 elements per row. float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0]; float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1]; float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2]; float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3]; // Pack to 4 registers. dst[ki][mi].reg(0) = fmha::float2_to_16bit_2(tmp_00, tmp_01); dst[ki][mi].reg(1) = fmha::float2_to_16bit_2(tmp_10, tmp_11); dst[ki][mi].reg(2) = fmha::float2_to_16bit_2(tmp_02, tmp_03); dst[ki][mi].reg(3) = fmha::float2_to_16bit_2(tmp_12, tmp_13); } } } }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax_fp32 : public Softmax_hmma { // The base class. using Base = Softmax_hmma; // The fragment. using Fragment_a = fmha::Fragment_a; // The MMAs. enum { MMAS_M = Base::MMAS_M }; enum { MMAS_N = Base::MMAS_N }; // The accumulators. using Accumulator = fmha::Fragment_accumulator; // Output accumulators (after conversion). using Accumulator_out = fmha::Fragment_accumulator; // Softmax dst data_type (BMM2 input) using Dst_type = typename Traits::A_type; // DEBUG. static_assert(Accumulator_out::NUM_REGS == 4, ""); // END OF DEBUG. // DEBUG. static_assert(std::is_same::value, ""); // END OF DEBUG. enum { WARPS_M = Cta_tile::WARPS_M }; enum { WARPS_N = Cta_tile::WARPS_N }; // Use BMM1 softcapping scale or not. enum { ENABLE_BMM1_SOFTCAPPING_SCALE = Kernel_traits::ENABLE_BMM1_SOFTCAPPING_SCALE }; using Smem_tile_red = Smem_tile_reduce; static_assert(Smem_tile_red::ELTS_PER_TILE == Cta_tile::M * WARPS_N); // Ctor. template inline __device__ Softmax_fp32(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx), smem_sum_(static_cast(smem), tidx), smem_max_(static_cast(smem) + Smem_tile_red::ELTS_PER_TILE, tidx) {} // Store the tile after softmax. template inline __device__ void store(Gmem_tile& gmem_tile) { Accumulator_out acc[MMAS_M][MMAS_N]; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // The elements. float tmp_00 = this->elt_[2 * mi + 0][4 * ni + 0]; float tmp_01 = this->elt_[2 * mi + 0][4 * ni + 1]; float tmp_02 = this->elt_[2 * mi + 0][4 * ni + 2]; float tmp_03 = this->elt_[2 * mi + 0][4 * ni + 3]; float tmp_10 = this->elt_[2 * mi + 1][4 * ni + 0]; float tmp_11 = this->elt_[2 * mi + 1][4 * ni + 1]; float tmp_12 = this->elt_[2 * mi + 1][4 * ni + 2]; float tmp_13 = this->elt_[2 * mi + 1][4 * ni + 3]; // Transform to accumulators. acc[mi][ni].reg(0) = fmha::float2_to_16bit_2(tmp_00, tmp_01); acc[mi][ni].reg(1) = fmha::float2_to_16bit_2(tmp_10, tmp_11); acc[mi][ni].reg(2) = fmha::float2_to_16bit_2(tmp_02, tmp_03); acc[mi][ni].reg(3) = fmha::float2_to_16bit_2(tmp_12, tmp_13); } } // Delegate to the gmem tile to store. gmem_tile.store(acc); } // Pack the data to a fragment for the next GEMM. template inline __device__ void pack(Fragment_a (&dst)[K][M]) const { static_assert(Fragment_a::NUM_REGS == 4, ""); #pragma unroll for (int mi = 0; mi < M; ++mi) { #pragma unroll for (int ki = 0; ki < K; ++ki) { // 1st row - 4 elements per row. float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0]; float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1]; float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2]; float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3]; // 2nd row - 4 elements per row. float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0]; float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1]; float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2]; float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3]; // Pack to 4 registers. dst[ki][mi].reg(0) = fmha::float2_to_16bit_2(tmp_00, tmp_01); dst[ki][mi].reg(1) = fmha::float2_to_16bit_2(tmp_10, tmp_11); dst[ki][mi].reg(2) = fmha::float2_to_16bit_2(tmp_02, tmp_03); dst[ki][mi].reg(3) = fmha::float2_to_16bit_2(tmp_12, tmp_13); } } } // Pack the data to a uint4 for the next operation. template inline __device__ void pack(uint4 (&dst)[M][N]) const { #pragma unroll for (int mi = 0; mi < M; ++mi) { #pragma unroll for (int ni = 0; ni < N; ++ni) { // 1st row - 4 elements per row. float tmp_00 = this->elt_[2 * mi + 0][4 * ni + 0]; float tmp_01 = this->elt_[2 * mi + 0][4 * ni + 1]; float tmp_02 = this->elt_[2 * mi + 0][4 * ni + 2]; float tmp_03 = this->elt_[2 * mi + 0][4 * ni + 3]; // 2nd row - 4 elements per row. float tmp_10 = this->elt_[2 * mi + 1][4 * ni + 0]; float tmp_11 = this->elt_[2 * mi + 1][4 * ni + 1]; float tmp_12 = this->elt_[2 * mi + 1][4 * ni + 2]; float tmp_13 = this->elt_[2 * mi + 1][4 * ni + 3]; // Pack to 4 registers. dst[mi][ni].x = fmha::float2_to_16bit_2(tmp_00, tmp_01); dst[mi][ni].y = fmha::float2_to_16bit_2(tmp_02, tmp_03); dst[mi][ni].z = fmha::float2_to_16bit_2(tmp_10, tmp_11); dst[mi][ni].w = fmha::float2_to_16bit_2(tmp_12, tmp_13); } } } // Scale FP32 fragments inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { float const scalef = reinterpret_cast(this->params_scale_bmm1_); #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // 1st row - 4 elements per row. this->elt_[2 * mi + 0][4 * ni + 0] = acc[mi][ni].elt(0) * scalef; this->elt_[2 * mi + 0][4 * ni + 1] = acc[mi][ni].elt(1) * scalef; this->elt_[2 * mi + 0][4 * ni + 2] = acc[mi][ni].elt(4) * scalef; this->elt_[2 * mi + 0][4 * ni + 3] = acc[mi][ni].elt(5) * scalef; // 2nd row - 4 elements per row. this->elt_[2 * mi + 1][4 * ni + 0] = acc[mi][ni].elt(2) * scalef; this->elt_[2 * mi + 1][4 * ni + 1] = acc[mi][ni].elt(3) * scalef; this->elt_[2 * mi + 1][4 * ni + 2] = acc[mi][ni].elt(6) * scalef; this->elt_[2 * mi + 1][4 * ni + 3] = acc[mi][ni].elt(7) * scalef; // Attention logit softcapping scale. // 1.0f / softcapping_scale has been fused to scale_bmm1. if constexpr (ENABLE_BMM1_SOFTCAPPING_SCALE) { this->elt_[2 * mi + 0][4 * ni + 0] = this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 0][4 * ni + 0]); this->elt_[2 * mi + 0][4 * ni + 1] = this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 0][4 * ni + 1]); this->elt_[2 * mi + 1][4 * ni + 0] = this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 1][4 * ni + 0]); this->elt_[2 * mi + 1][4 * ni + 1] = this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 1][4 * ni + 1]); this->elt_[2 * mi + 0][4 * ni + 2] = this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 0][4 * ni + 2]); this->elt_[2 * mi + 0][4 * ni + 3] = this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 0][4 * ni + 3]); this->elt_[2 * mi + 1][4 * ni + 2] = this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 1][4 * ni + 2]); this->elt_[2 * mi + 1][4 * ni + 3] = this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[2 * mi + 1][4 * ni + 3]); } } } } // Scale FP32 fragments inline __device__ void unpack_noscale(Accumulator const (&acc)[MMAS_M][MMAS_N]) { #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // 1st row - 4 elements per row. this->elt_[2 * mi + 0][4 * ni + 0] = acc[mi][ni].elt(0); this->elt_[2 * mi + 0][4 * ni + 1] = acc[mi][ni].elt(1); this->elt_[2 * mi + 0][4 * ni + 2] = acc[mi][ni].elt(4); this->elt_[2 * mi + 0][4 * ni + 3] = acc[mi][ni].elt(5); // 2nd row - 4 elements per row. this->elt_[2 * mi + 1][4 * ni + 0] = acc[mi][ni].elt(2); this->elt_[2 * mi + 1][4 * ni + 1] = acc[mi][ni].elt(3); this->elt_[2 * mi + 1][4 * ni + 2] = acc[mi][ni].elt(6); this->elt_[2 * mi + 1][4 * ni + 3] = acc[mi][ni].elt(7); } } } template __device__ inline void reduce_(float (&frag)[2 * MMAS_M], Operator& op, Smem_tile_red& smem_red) { #pragma unroll for (int mi = 0; mi < 2 * MMAS_M; mi++) { frag[mi] = this->elt_[mi][0]; #pragma unroll for (int ni = 1; ni < 4 * MMAS_N; ni++) { frag[mi] = op(frag[mi], this->elt_[mi][ni]); } } quad_reduce(frag, frag, op); if (WARPS_N > 1) { smem_red.store(frag); __syncthreads(); typename Smem_tile_red::read_t tmp[2 * MMAS_M]; smem_red.load(tmp); quad_allreduce(frag, tmp, op); } } __device__ inline void reduce_max(float (&frag)[2 * MMAS_M]) { MaxOp max; reduce_(frag, max, smem_max_); } __device__ inline void reduce_sum(float (&frag)[2 * MMAS_M]) { SumOp sum; reduce_(frag, sum, smem_sum_); } __device__ inline float correct(float warp_sum, float warp_max, float max) { return warp_sum * __expf(warp_max - max); } __device__ inline float2 correct(float2 warp_sum, float2 warp_max, float max) { return {correct(warp_sum.x, warp_max.x, max), correct(warp_sum.y, warp_max.y, max)}; } __device__ inline void online_softmax() { MaxOp maxOp; SumOp sumOp; float max[2 * MMAS_M]; #pragma unroll for (int mi = 0; mi < 2 * MMAS_M; mi++) { max[mi] = this->elt_[mi][0]; #pragma unroll for (int ni = 1; ni < 4 * MMAS_N; ni++) { max[mi] = maxOp(max[mi], this->elt_[mi][ni]); } } quad_allreduce(max, max, maxOp); smem_max_.store(max); float sum[2 * MMAS_M]; #pragma unroll for (int mi = 0; mi < 2 * MMAS_M; mi++) { sum[mi] = 0.f; #pragma unroll for (int ni = 0; ni < 4 * MMAS_N; ni++) { float x = this->elt_[mi][ni]; this->elt_[mi][ni] = __expf(x - max[mi]); sum[mi] += this->elt_[mi][ni]; } } quad_allreduce(sum, sum, sumOp); smem_sum_.store(sum); __syncthreads(); typename Smem_tile_red::read_t tmp_max[2 * MMAS_M]; typename Smem_tile_red::read_t tmp_sum[2 * MMAS_M]; smem_max_.load(tmp_max); smem_sum_.load(tmp_sum); float full_max[2 * MMAS_M]; quad_allreduce(full_max, tmp_max, maxOp); #pragma unroll for (int mi = 0; mi < 2 * MMAS_M; mi++) { tmp_sum[mi] = correct(tmp_sum[mi], tmp_max[mi], full_max[mi]); } quad_allreduce(sum, tmp_sum, sumOp); #pragma unroll for (int mi = 0; mi < 2 * MMAS_M; mi++) { float correction = __expf(max[mi] - full_max[mi]) / sum[mi]; #pragma unroll for (int ni = 0; ni < 4 * MMAS_N; ni++) { this->elt_[mi][ni] *= correction; } } } Smem_tile_red smem_max_; Smem_tile_red smem_sum_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax : public Softmax_fp32 { // The traits. using Traits = fmha::Ampere_hmma_fp32_traits; // The base class. using Base = Softmax_fp32; // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx) {} }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax : public Softmax_fp32 { // The traits. using Traits = fmha::Turing_hmma_fp32_traits; // The base class. using Base = Softmax_fp32; // The fragment. using Fragment_a = fmha::Fragment_a; // Softmax dst data_type (BMM2 input) using Dst_type = typename Traits::A_type; // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx) {} // Pack the data to a fragment for the next GEMM. template inline __device__ void pack(Fragment_a (&dst)[K][M]) const { static_assert(Fragment_a::NUM_REGS == 2, ""); static_assert(Base::Mma_tile::MMAS_M == M && Base::Mma_tile::MMAS_N * 4 == K * 2, ""); #pragma unroll for (int mi = 0; mi < M; ++mi) { #pragma unroll for (int ki = 0; ki < K; ++ki) { // 1st row - 2 elements per row. float tmp_00 = this->elt_[2 * mi + 0][2 * ki + 0]; float tmp_01 = this->elt_[2 * mi + 0][2 * ki + 1]; // 2nd row - 2 elements per row. float tmp_10 = this->elt_[2 * mi + 1][2 * ki + 0]; float tmp_11 = this->elt_[2 * mi + 1][2 * ki + 1]; // Pack to 2 registers. dst[ki][mi].reg(0) = fmha::float2_to_16bit_2(tmp_00, tmp_01); dst[ki][mi].reg(1) = fmha::float2_to_16bit_2(tmp_10, tmp_11); } } } }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax : public Softmax_fp32 { // The traits. using Traits = fmha::Ampere_hmma_bf16_traits; // The base class. using Base = Softmax_fp32; // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx) {} }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax : public Softmax_imma { // The traits. using Traits = fmha::Ampere_imma_int8_int32_traits; // The base class. using Base = Softmax_imma; // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx) {} }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax : public Softmax_qmma { // The traits. using Traits = fmha::Ada_qmma_e4m3_fp32_traits; // The base class. using Base = Softmax_qmma; // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx) {} }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax : public Softmax_qmma { // The traits. using Traits = fmha::Ada_qmma_e4m3_fp16_traits; // The base class. using Base = Softmax_qmma; // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx) {} }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax : public Softmax_imma { // The Traits using Traits = fmha::Ada_qmma_e4m3_fp32_traits; // The base class. using Base = Softmax_imma; // The MMAs. enum { MMAS_M = Base::MMAS_M }; enum { MMAS_N = Base::MMAS_N }; // The accumulators. using Accumulator = fmha::Fragment_accumulator; // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx), params_scale_bmm1_(params.scale_bmm1_d ? *params.scale_bmm1_d : params.scale_bmm1), params_scale_softmax_(params.scale_softmax) {} // Store the tile after softmax. template inline __device__ void store(Gmem_tile& gmem_tile) { float const scale = reinterpret_cast(params_scale_softmax_); Accumulator acc[MMAS_M][MMAS_N]; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // scale acc[mi][ni].ele(0) = this->elt_[2 * mi + 0][4 * ni + 0] * scale; acc[mi][ni].ele(1) = this->elt_[2 * mi + 0][4 * ni + 1] * scale; acc[mi][ni].ele(4) = this->elt_[2 * mi + 0][4 * ni + 2] * scale; acc[mi][ni].ele(5) = this->elt_[2 * mi + 0][4 * ni + 3] * scale; acc[mi][ni].ele(2) = this->elt_[2 * mi + 1][4 * ni + 0] * scale; acc[mi][ni].ele(3) = this->elt_[2 * mi + 1][4 * ni + 1] * scale; acc[mi][ni].ele(6) = this->elt_[2 * mi + 1][4 * ni + 2] * scale; acc[mi][ni].ele(7) = this->elt_[2 * mi + 1][4 * ni + 3] * scale; } } // Delegate to the gmem tile to store. // TODO: need fp32 to fp8 conversion (move this to gmem_tile) gmem_tile.store(acc); } // Convert from accumulators to floats. inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { float const scale = params_scale_q_ * params_scale_k_; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { // Convert to FP32 and scale. this->elt_[2 * mi + 0][4 * ni + 0] = acc[mi][ni].elt(0) * scale; this->elt_[2 * mi + 0][4 * ni + 1] = acc[mi][ni].elt(1) * scale; this->elt_[2 * mi + 0][4 * ni + 2] = acc[mi][ni].elt(4) * scale; this->elt_[2 * mi + 0][4 * ni + 3] = acc[mi][ni].elt(5) * scale; this->elt_[2 * mi + 1][4 * ni + 0] = acc[mi][ni].elt(2) * scale; this->elt_[2 * mi + 1][4 * ni + 1] = acc[mi][ni].elt(3) * scale; this->elt_[2 * mi + 1][4 * ni + 2] = acc[mi][ni].elt(6) * scale; this->elt_[2 * mi + 1][4 * ni + 3] = acc[mi][ni].elt(7) * scale; } } } template inline __device__ void apply_exp_with_mask(float const (&max)[MMAS_M * 2]) { #pragma unroll for (int mi = 0; mi < MMAS_M * 2; ++mi) { float max_val = APPLY_MASK && max[mi] == -FLT_MAX ? 0.f : (max[mi] - logf(Traits::SOFTMAX_FP_QUANT_SCALE)); #pragma unroll for (int ni = 0; ni < MMAS_N * 4; ++ni) { this->elt_[mi][ni] = expf(this->elt_[mi][ni] - max_val); } } } // Pack the data to a fragment for the next GEMM. template inline __device__ void pack(Fragment_a (&dst)[K][M]) const { float const scale = reinterpret_cast(this->params_scale_softmax_); // The canonical layout in K should be R0: [0,1,2,3] R2: [16,17,18,19] // Note below that this is not possible with the register layout of the accumulator. #pragma unroll for (int mi = 0; mi < M; ++mi) { #pragma unroll for (int ki = 0; ki < K; ++ki) { // 1st row - 8 elements per row. float tmp_00 = this->elt_[2 * mi + 0][8 * ki + 0] * scale; // + 0 float tmp_01 = this->elt_[2 * mi + 0][8 * ki + 1] * scale; // + 1 float tmp_02 = this->elt_[2 * mi + 0][8 * ki + 2] * scale; // + 8 float tmp_03 = this->elt_[2 * mi + 0][8 * ki + 3] * scale; // + 9 float tmp_04 = this->elt_[2 * mi + 0][8 * ki + 4] * scale; // +16 float tmp_05 = this->elt_[2 * mi + 0][8 * ki + 5] * scale; // +17 float tmp_06 = this->elt_[2 * mi + 0][8 * ki + 6] * scale; // +24 float tmp_07 = this->elt_[2 * mi + 0][8 * ki + 7] * scale; // +25 // 2nd row - 4 elements per row. float tmp_10 = this->elt_[2 * mi + 1][8 * ki + 0] * scale; // + 0 float tmp_11 = this->elt_[2 * mi + 1][8 * ki + 1] * scale; // + 1 float tmp_12 = this->elt_[2 * mi + 1][8 * ki + 2] * scale; // + 8 float tmp_13 = this->elt_[2 * mi + 1][8 * ki + 3] * scale; // + 9 float tmp_14 = this->elt_[2 * mi + 1][8 * ki + 4] * scale; // +16 float tmp_15 = this->elt_[2 * mi + 1][8 * ki + 5] * scale; // +17 float tmp_16 = this->elt_[2 * mi + 1][8 * ki + 6] * scale; // +24 float tmp_17 = this->elt_[2 * mi + 1][8 * ki + 7] * scale; // +25 // Pack to 4 registers. dst[ki][mi].reg(0) = fmha::float4_to_fp8x4(tmp_00, tmp_01, tmp_02, tmp_03); dst[ki][mi].reg(1) = fmha::float4_to_fp8x4(tmp_10, tmp_11, tmp_12, tmp_13); dst[ki][mi].reg(2) = fmha::float4_to_fp8x4(tmp_04, tmp_05, tmp_06, tmp_07); dst[ki][mi].reg(3) = fmha::float4_to_fp8x4(tmp_14, tmp_15, tmp_16, tmp_17); } } } template inline __device__ void move_to_first_block(Params const& params, int bidb, int bidh, int q_loop) { int scale_q_iter = bidb * params.h * params.sage.q.max_nblock + bidh * params.sage.q.max_nblock + q_loop; params_scale_q_ = __ldg(params.sage.q.scales + scale_q_iter); params_scale_q_ *= reinterpret_cast(params_scale_bmm1_); int scale_k_iter = bidb * params.h * params.sage.k.max_nblock + bidh * params.sage.k.max_nblock; params_scale_k_iter = reinterpret_cast(params.sage.k.scales + scale_k_iter); params_scale_k_ = __ldg(params_scale_k_iter); } inline __device__ void move_to_next_block() { params_scale_k_iter += 1; params_scale_k_ = __ldg(params_scale_k_iter); } // The scaling factors. uint32_t const params_scale_bmm1_, params_scale_softmax_; float params_scale_q_, params_scale_k_; float const* params_scale_k_iter; }; //////////////////////////////////////////////////////////////////////////////////////////////////// // HOPPER SOFTMAX //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax_gmma_base {}; template struct Softmax_gmma_base { // The instruction traits. using Traits = Traits_; // The Cta_tile. using Cta_tile = Cta_tile_; // The Kernel traits. using Kernel_traits = Kernel_traits_; // The accumulators. using Accumulator = fmha::Fragment_accumulator; // The Mma tile. using Mma_tile = typename Traits::template Mma_tile; static_assert(Cta_tile::WARPS_M == 4); static_assert(Mma_tile::M_PER_MMA_PER_CTA == 64); // The number of MMAs in M/N dimensions. enum { MMAS_M = Mma_tile::MMAS_M }; enum { MMAS_N = Mma_tile::MMAS_N }; // Elements per thread per core matrix. enum { ELTS_PER_THREAD = 2 }; // Core matrix is always 8x4. enum { THREADS_PER_ROW = 4 }; enum { SMEM_BYTES = 0 }; // The number of rows accessed by each thread. enum { ROWS_PER_THREAD = Traits::GMMA_M / (Cta_tile::THREADS_PER_WARP / THREADS_PER_ROW) / Cta_tile::WARPS_M }; static_assert(ROWS_PER_THREAD == Mma_tile::ROWS_PER_THREAD); // The number of columns access by each thread. // Note there are 2 elements per reg. enum { COLS_PER_THREAD = Traits::GMMA_N / THREADS_PER_ROW / ELTS_PER_THREAD }; // The number of total elements per thread. enum { TOTAL_ELTS_PER_THREAD = ELTS_PER_THREAD * COLS_PER_THREAD }; template inline __device__ Softmax_gmma_base(Params const& params, void*, int const, int const) : params_scale_bmm1_(params.scale_bmm1), params_softcapping_scale_bmm1_(params.softcapping_scale_bmm1) {} // Apply mask before softmax. Use 1 byte per MMA distributed as 2x4. template inline __device__ void apply_mask(Mask const& mask) { #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ii = 0; ii < ROWS_PER_THREAD; ++ii) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { #pragma unroll for (int jj = 0; jj < TOTAL_ELTS_PER_THREAD; ++jj) { if (!mask.is_valid(mi, ni, ii, jj)) { this->elt_[ROWS_PER_THREAD * mi + ii][TOTAL_ELTS_PER_THREAD * ni + jj] = -FLT_MAX; } } // jj } // ni } // ii } // mi } template inline __device__ void apply_mask_alibi(Mask const& mask, int head_id, AlibiParams const& alibi_params) { // 'if constexpr' because ALiBi is only defined for causal masks if constexpr (Kernel_traits::CAUSAL_MASK) { float m = get_alibi_head_scaling_factor(head_id, alibi_params); #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ii = 0; ii < ROWS_PER_THREAD; ++ii) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { #pragma unroll for (int jj = 0; jj < TOTAL_ELTS_PER_THREAD; ++jj) { int row, col; mask.get_row_col(row, col, mi, ni, ii, jj); if (mask.is_valid(row, col)) { // Since softmax is shift invariant, // it is sufficient just to use the column as the multiplier elt_[ROWS_PER_THREAD * mi + ii][TOTAL_ELTS_PER_THREAD * ni + jj] = elt_[ROWS_PER_THREAD * mi + ii][TOTAL_ELTS_PER_THREAD * ni + jj] * alibi_params.scale_after_alibi + m * (col + alibi_params.sequence_pos_offset); } else { elt_[ROWS_PER_THREAD * mi + ii][TOTAL_ELTS_PER_THREAD * ni + jj] = -FLT_MAX; } } } } } } else { __builtin_unreachable(); } } // Do a CTA-wide reduction. template inline __device__ void reduce_4x1(float (&dst)[MMAS_M * ROWS_PER_THREAD]) { #if defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) static_assert(MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD == MMAS_N * Mma_tile::CORES_N * 2); if (Functor::IS_SUM) { // Apply the summation inside the thread. #pragma unroll for (int mi = 0; mi < MMAS_M * ROWS_PER_THREAD; ++mi) { dst[mi] = (this->elt_[mi][0] + this->elt_[mi][1]); #pragma unroll for (int ni = 1; ni < MMAS_N * Mma_tile::CORES_N; ni++) { dst[mi] += (this->elt_[mi][ni * 2 + 0] + this->elt_[mi][ni * 2 + 1]); } } } else #endif // defined(USE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE) { // find the max/sum for each row. // For hopper, each row is held entirely within 4 threads. // Apply the functor for each row inside a thread. #pragma unroll for (int mi = 0; mi < MMAS_M * ROWS_PER_THREAD; ++mi) { dst[mi] = this->elt_[mi][0]; #pragma unroll for (int ni = 1; ni < MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD; ++ni) { dst[mi] = Functor::apply(dst[mi], this->elt_[mi][ni]); } } } // Apply the functor for each row inside each group of 4 threads. #pragma unroll for (int mi = 0; mi < MMAS_M * ROWS_PER_THREAD; ++mi) { dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 1)); __syncwarp(); dst[mi] = Functor::apply(dst[mi], __shfl_xor_sync(uint32_t(-1), dst[mi], 2)); __syncwarp(); } } // Do a CTA-wide reduction. template inline __device__ void reduce(float (&dst)[MMAS_M * ROWS_PER_THREAD]) { reduce_4x1(dst); } // Apply the exp to all the elements. inline __device__ void apply_exp(float const (&max)[MMAS_M * ROWS_PER_THREAD]) { #pragma unroll for (int mi = 0; mi < MMAS_M * ROWS_PER_THREAD; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD; ++ni) { this->elt_[mi][ni] = apply_exp_(this->elt_[mi][ni], max[mi]); } } } // Scale all the elements. inline __device__ void scale(float const (&sum)[MMAS_M * ROWS_PER_THREAD]) { // Precompute the inverse sum to normalize. Without -use_fast_math, it makes a huge deal. float inv_sum[MMAS_M * ROWS_PER_THREAD]; #pragma unroll for (int mi = 0; mi < MMAS_M * ROWS_PER_THREAD; ++mi) { inv_sum[mi] = (sum[mi] == 0.f || sum[mi] != sum[mi]) ? 1.f : 1.f / sum[mi]; } // Update the values. #pragma unroll for (int mi = 0; mi < MMAS_M * ROWS_PER_THREAD; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD; ++ni) { this->elt_[mi][ni] *= inv_sum[mi]; } } } // The scalig factor. Depens on acc type, e.g. float for 32-bit and fp16x2/bf16x2 for 16-bit. uint32_t const params_scale_bmm1_; float const params_softcapping_scale_bmm1_; // The elements. float elt_[MMAS_M * ROWS_PER_THREAD][MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD]; }; template struct Softmax_gmma_base : public Softmax_gmma_base { using Base = Softmax_gmma_base; using Mma_tile = typename Base::Mma_tile; enum { BYTES_PER_SMEM = Mma_tile::M_PER_MMA_PER_CTA * Cta_tile::WARPS_N * sizeof(float) }; enum { ELTS_PER_ROW = 2 }; static_assert(Cta_tile::WARPS_N == 2); static_assert(Cta_tile::WARPS_M == 4); static_assert(Mma_tile::M_PER_MMA_PER_CTA == 64); template inline __device__ Softmax_gmma_base(Params const& params, void* smem, int const bidb, int const tidx) : Base(params, smem, bidb, tidx) { int const warp = tidx / Cta_tile::THREADS_PER_WARP; int const warp_n = warp / 4; int const warp_m = warp % 4; int const lane = tidx % Cta_tile::THREADS_PER_WARP; int const quad = lane / 4; is_writer_ = lane % 4 == 0; int const col = warp_n; int const row = warp_m * 16 + quad; smem_write_ = static_cast(smem) + row * 2 + col; smem_read_ = static_cast(smem) + row; } // Do a CTA-wide reduction. template inline __device__ void reduce(float (&dst)[2]) { Base::template reduce_4x1(dst); if (is_writer_) { smem_write_[0 * ELTS_PER_ROW] = dst[0]; smem_write_[8 * ELTS_PER_ROW] = dst[1]; } __syncthreads(); float2 tmp0 = smem_read_[0]; float2 tmp1 = smem_read_[8]; dst[0] = Functor::apply(tmp0.x, tmp0.y); dst[1] = Functor::apply(tmp1.x, tmp1.y); } float* smem_write_; float2* smem_read_; bool is_writer_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax, Cta_tile_, Kernel_traits_> : public Softmax_gmma_base< fmha::Hopper_hgmma_fp16_traits, Cta_tile_, Kernel_traits_, Cta_tile_::WARPS_N> { // The traits. using Traits = fmha::Hopper_hgmma_fp16_traits; // Cta_tile. using Cta_tile = Cta_tile_; // Kernel_traits. using Kernel_traits = Kernel_traits_; // The Base class. using Base = Softmax_gmma_base; // The accumulators. using Accumulator = typename Base::Accumulator; // The Mma tile. using Mma_tile = typename Base::Mma_tile; // The number of MMAs in M/N dimensions. enum { MMAS_M = Mma_tile::MMAS_M }; enum { MMAS_N = Mma_tile::MMAS_N }; // for HGMMA_FP16, there are 2 elements per RF for ACC. enum { ELTS_PER_THREAD = 2 }; // for Hopper HGMMA, each row is held within 4 threads. enum { THREADS_PER_ROW = 4 }; // The number of rows accessed by each thread. enum { ROWS_PER_THREAD = Traits::GMMA_M / (Cta_tile::THREADS_PER_WARP / THREADS_PER_ROW) / Cta_tile::WARPS_M }; // The number of columns access by each thread. // Note there are 2 elements per reg. enum { COLS_PER_THREAD = Traits::GMMA_N / THREADS_PER_ROW / ELTS_PER_THREAD }; // Use BMM1 softcapping scale or not. enum { ENABLE_BMM1_SOFTCAPPING_SCALE = Kernel_traits::ENABLE_BMM1_SOFTCAPPING_SCALE }; // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx) {} // Convert from FP16 fragments to floats. inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { #pragma unroll for (int col_idx = 0; col_idx < COLS_PER_THREAD; ++col_idx) { #pragma unroll for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { // the order of the acc rf is we traverse vertically first // then we traverse horizontally. // Normalize the values. uint32_t acc_0 = fmha::hmul2(acc[mi][ni].reg(col_idx * ROWS_PER_THREAD + row_idx), this->params_scale_bmm1_); // Element index. int elt_row_idx = ROWS_PER_THREAD * mi + row_idx; int elt_col_idx = COLS_PER_THREAD * ELTS_PER_THREAD * ni + col_idx * ELTS_PER_THREAD; // Extract the values as floats. half2_to_float2(this->elt_[elt_row_idx][elt_col_idx + 0], this->elt_[elt_row_idx][elt_col_idx + 1], acc_0); // Attention logit softcapping scale. // 1.0f / softcapping_scale has been fused to scale_bmm1. if constexpr (ENABLE_BMM1_SOFTCAPPING_SCALE) { this->elt_[elt_row_idx][elt_col_idx + 0] = this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[elt_row_idx][elt_col_idx + 0]); this->elt_[elt_row_idx][elt_col_idx + 1] = this->params_softcapping_scale_bmm1_ * __tanhf(this->elt_[elt_row_idx][elt_col_idx + 1]); } } // row_idx } // col_idx } // ni } // mi } // Store the tile after softmax. template inline __device__ void store(Gmem_tile& gmem_tile) { Accumulator acc[MMAS_M][MMAS_N]; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { #pragma unroll for (int col_idx = 0; col_idx < COLS_PER_THREAD; ++col_idx) { #pragma unroll for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { // the order of the acc rf is we traverse vertically first // then we traverse horizontally. float tmp_00 = this->elt_[ROWS_PER_THREAD * mi + row_idx] [COLS_PER_THREAD * ELTS_PER_THREAD * ni + col_idx * ELTS_PER_THREAD + 0]; float tmp_01 = this->elt_[ROWS_PER_THREAD * mi + row_idx] [COLS_PER_THREAD * ELTS_PER_THREAD * ni + col_idx * ELTS_PER_THREAD + 1]; acc[mi][ni].reg(col_idx * ROWS_PER_THREAD + row_idx) = fmha::float2_to_half2(tmp_00, tmp_01); } // row_idx } // col_idx } // ni } // m // Delegate to the gmem tile to store. gmem_tile.store(acc); } // Pack the data to a fragment for the next GEMM. template inline __device__ void pack(Fragment_a (&dst)[K][M]) const { // we know the instruction shape is 64xNx16 // Thus for input A matrix, it is of size 64x16 per warpgroup. // Thus, each threads access 2 rows and 4 columns. contiguous 2 columns are held by 1 RF. #pragma unroll for (int mi = 0; mi < M; ++mi) { #pragma unroll for (int ki = 0; ki < K; ++ki) { // 1st row - 4 elements per row. float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0]; float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1]; float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2]; float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3]; // 2nd row - 4 elements per row. float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0]; float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1]; float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2]; float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3]; // Pack to 4 registers. dst[ki][mi].reg(0) = fmha::float2_to_half2(tmp_00, tmp_01); dst[ki][mi].reg(1) = fmha::float2_to_half2(tmp_10, tmp_11); dst[ki][mi].reg(2) = fmha::float2_to_half2(tmp_02, tmp_03); dst[ki][mi].reg(3) = fmha::float2_to_half2(tmp_12, tmp_13); } } } }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax, Cta_tile_, Kernel_traits_> : public Softmax_gmma_base< fmha::Hopper_hgmma_fp32_traits, Cta_tile_, Kernel_traits_, Cta_tile_::WARPS_N> { // The traits. using Traits = fmha::Hopper_hgmma_fp32_traits; // Cta_tile. using Cta_tile = Cta_tile_; // Kernel_traits. using Kernel_traits = Kernel_traits_; // The Base class. using Base = Softmax_gmma_base; // The accumulators. using Accumulator = typename Base::Accumulator; // The Mma tile. using Mma_tile = typename Base::Mma_tile; // The number of MMAs in M/N dimensions. enum { MMAS_M = Mma_tile::MMAS_M }; enum { MMAS_N = Mma_tile::MMAS_N }; // for HGMMA_FP16, there are 2 elements per RF for ACC. enum { ELTS_PER_THREAD = 2 }; // for Hopper HGMMA, each row is held within 4 threads. enum { THREADS_PER_ROW = 4 }; // The number of rows accessed by each thread. enum { ROWS_PER_THREAD = Traits::GMMA_M / (Cta_tile::THREADS_PER_WARP / THREADS_PER_ROW) / Cta_tile::WARPS_M }; // The number of columns access by each thread. // Note there are 2 elements per reg. enum { COLS_PER_THREAD = Traits::GMMA_N / THREADS_PER_ROW / ELTS_PER_THREAD }; // Use BMM1 softcapping scale or not. enum { ENABLE_BMM1_SOFTCAPPING_SCALE = Kernel_traits::ENABLE_BMM1_SOFTCAPPING_SCALE }; // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx) {} // Convert from FP16 fragments to floats. inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { float const& scale_f = reinterpret_cast(this->params_scale_bmm1_); #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { #pragma unroll for (int col_idx = 0; col_idx < COLS_PER_THREAD; ++col_idx) { #pragma unroll for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { // the order of the acc rf is we traverse vertically first // then we traverse horizontally. int elt_row = ROWS_PER_THREAD * mi + row_idx; int elt_col = COLS_PER_THREAD * ELTS_PER_THREAD * ni + col_idx * ELTS_PER_THREAD; float elt0 = acc[mi][ni].elt(col_idx * 2 * ROWS_PER_THREAD + 2 * row_idx + 0) * scale_f; float elt1 = acc[mi][ni].elt(col_idx * 2 * ROWS_PER_THREAD + 2 * row_idx + 1) * scale_f; // 1.0f / softcapping_scale has been fused to scale_bmm1. if constexpr (ENABLE_BMM1_SOFTCAPPING_SCALE) { elt0 = this->params_softcapping_scale_bmm1_ * __tanhf(elt0); elt1 = this->params_softcapping_scale_bmm1_ * __tanhf(elt1); } this->elt_[elt_row][elt_col + 0] = elt0; this->elt_[elt_row][elt_col + 1] = elt1; } // row_idx } // col_idx } // ni } // mi } // Store the tile after softmax. template inline __device__ void store(Gmem_tile& gmem_tile) { Accumulator acc[MMAS_M][MMAS_N]; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { #pragma unroll for (int col_idx = 0; col_idx < COLS_PER_THREAD; ++col_idx) { #pragma unroll for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { // the order of the acc rf is we traverse vertically first // then we traverse horizontally int elt_row = ROWS_PER_THREAD * mi + row_idx; int elt_col = COLS_PER_THREAD * ELTS_PER_THREAD * ni + col_idx * ELTS_PER_THREAD; float elt0 = this->elt_[elt_row][elt_col + 0]; float elt1 = this->elt_[elt_row][elt_col + 1]; acc[mi][ni].elt(col_idx * 2 * ROWS_PER_THREAD + 2 * row_idx + 0) = elt0; acc[mi][ni].elt(col_idx * 2 * ROWS_PER_THREAD + 2 * row_idx + 1) = elt1; } // row_idx } // col_idx } // ni } // m // Delegate to the gmem tile to store. gmem_tile.store(acc); } // Pack the data to a fragment for the next GEMM. template inline __device__ void pack(Fragment_a (&dst)[K][M]) const { // we know the instruction shape is 64xNx16 // Thus for input A matrix, it is of size 64x16 per warpgroup. // Thus, each threads access 2 rows and 4 columns. contiguous 2 columns are held by 1 RF. #pragma unroll for (int mi = 0; mi < M; ++mi) { #pragma unroll for (int ki = 0; ki < K; ++ki) { // 1st row - 4 elements per row. float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0]; float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1]; float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2]; float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3]; // 2nd row - 4 elements per row. float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0]; float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1]; float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2]; float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3]; // Pack to 4 registers. dst[ki][mi].reg(0) = fmha::float2_to_half2(tmp_00, tmp_01); dst[ki][mi].reg(1) = fmha::float2_to_half2(tmp_10, tmp_11); dst[ki][mi].reg(2) = fmha::float2_to_half2(tmp_02, tmp_03); dst[ki][mi].reg(3) = fmha::float2_to_half2(tmp_12, tmp_13); } } } }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax, Cta_tile_, Kernel_traits_> : public Softmax_gmma_base< fmha::Hopper_hgmma_bf16_traits, Cta_tile_, Kernel_traits_, Cta_tile_::WARPS_N> { // The traits. using Traits = fmha::Hopper_hgmma_bf16_traits; // Cta_tile. using Cta_tile = Cta_tile_; // Kernel_traits. using Kernel_traits = Kernel_traits_; // The Base class. using Base = Softmax_gmma_base; // The accumulators. using Accumulator = typename Base::Accumulator; // The Mma tile. using Mma_tile = typename Base::Mma_tile; // The number of MMAs in M/N dimensions. enum { MMAS_M = Mma_tile::MMAS_M }; enum { MMAS_N = Mma_tile::MMAS_N }; // for HGMMA_FP16, there are 2 elements per RF for ACC. enum { ELTS_PER_THREAD = 2 }; // for Hopper HGMMA, each row is held within 4 threads. enum { THREADS_PER_ROW = 4 }; // The number of rows accessed by each thread. enum { ROWS_PER_THREAD = Traits::GMMA_M / (Cta_tile::THREADS_PER_WARP / THREADS_PER_ROW) / Cta_tile::WARPS_M }; // The number of columns access by each thread. // Note there are 2 elements per reg. enum { COLS_PER_THREAD = Traits::GMMA_N / THREADS_PER_ROW / ELTS_PER_THREAD }; // Use BMM1 softcapping scale or not. enum { ENABLE_BMM1_SOFTCAPPING_SCALE = Kernel_traits::ENABLE_BMM1_SOFTCAPPING_SCALE }; // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx) {} // Convert from FP16 fragments to floats. inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { float const& scale_f = reinterpret_cast(this->params_scale_bmm1_); #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { #pragma unroll for (int col_idx = 0; col_idx < COLS_PER_THREAD; ++col_idx) { #pragma unroll for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { // the order of the acc rf is we traverse vertically first // then we traverse horizontally. int elt_row = ROWS_PER_THREAD * mi + row_idx; int elt_col = COLS_PER_THREAD * ELTS_PER_THREAD * ni + col_idx * ELTS_PER_THREAD; float elt0 = acc[mi][ni].elt(col_idx * 2 * ROWS_PER_THREAD + 2 * row_idx + 0) * scale_f; float elt1 = acc[mi][ni].elt(col_idx * 2 * ROWS_PER_THREAD + 2 * row_idx + 1) * scale_f; if constexpr (ENABLE_BMM1_SOFTCAPPING_SCALE) { elt0 = this->params_softcapping_scale_bmm1_ * __tanhf(elt0); elt1 = this->params_softcapping_scale_bmm1_ * __tanhf(elt1); } this->elt_[elt_row][elt_col + 0] = elt0; this->elt_[elt_row][elt_col + 1] = elt1; } // row_idx } // col_idx } // ni } // mi } // Store the tile after softmax. template inline __device__ void store(Gmem_tile& gmem_tile) { Accumulator acc[MMAS_M][MMAS_N]; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { #pragma unroll for (int col_idx = 0; col_idx < COLS_PER_THREAD; ++col_idx) { #pragma unroll for (int row_idx = 0; row_idx < ROWS_PER_THREAD; ++row_idx) { // the order of the acc rf is we traverse vertically first // then we traverse horizontally. int elt_row = ROWS_PER_THREAD * mi + row_idx; int elt_col = COLS_PER_THREAD * ELTS_PER_THREAD * ni + col_idx * ELTS_PER_THREAD; float elt0 = this->elt_[elt_row][elt_col + 0]; float elt1 = this->elt_[elt_row][elt_col + 1]; acc[mi][ni].elt(col_idx * 2 * ROWS_PER_THREAD + 2 * row_idx + 0) = elt0; acc[mi][ni].elt(col_idx * 2 * ROWS_PER_THREAD + 2 * row_idx + 1) = elt1; } // row_idx } // col_idx } // ni } // m // Delegate to the gmem tile to store. gmem_tile.store(acc); } // Pack the data to a fragment for the next GEMM. template inline __device__ void pack(Fragment_a (&dst)[K][M]) const { // we know the instruction shape is 64xNx16 // Thus for input A matrix, it is of size 64x16 per warpgroup. // Thus, each threads access 2 rows and 4 columns. contiguous 2 columns are held by 1 RF. #pragma unroll for (int mi = 0; mi < M; ++mi) { #pragma unroll for (int ki = 0; ki < K; ++ki) { // 1st row - 4 elements per row. float tmp_00 = this->elt_[2 * mi + 0][4 * ki + 0]; float tmp_01 = this->elt_[2 * mi + 0][4 * ki + 1]; float tmp_02 = this->elt_[2 * mi + 0][4 * ki + 2]; float tmp_03 = this->elt_[2 * mi + 0][4 * ki + 3]; // 2nd row - 4 elements per row. float tmp_10 = this->elt_[2 * mi + 1][4 * ki + 0]; float tmp_11 = this->elt_[2 * mi + 1][4 * ki + 1]; float tmp_12 = this->elt_[2 * mi + 1][4 * ki + 2]; float tmp_13 = this->elt_[2 * mi + 1][4 * ki + 3]; // Pack to 4 registers. dst[ki][mi].reg(0) = fmha::float2_to_bf16_x2(tmp_00, tmp_01); dst[ki][mi].reg(1) = fmha::float2_to_bf16_x2(tmp_10, tmp_11); dst[ki][mi].reg(2) = fmha::float2_to_bf16_x2(tmp_02, tmp_03); dst[ki][mi].reg(3) = fmha::float2_to_bf16_x2(tmp_12, tmp_13); } } } }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax_gmma_32bit_8bit_base : public Softmax_gmma_base { // The Base class. using Base = Softmax_gmma_base; // The accumulators. using Accumulator = typename Base::Accumulator; // The Mma tile. using Mma_tile = typename Base::Mma_tile; // The number of MMAs in M/N dimensions. enum { MMAS_M = Mma_tile::MMAS_M }; enum { MMAS_N = Mma_tile::MMAS_N }; // TODO these should be general. // Two elts per thread per acc core matrix. enum { ELTS_PER_THREAD = 2 }; // Number of threads per row of the acc core matrix. enum { THREADS_PER_ROW = 4 }; // The number of rows accessed by each thread per GMMA. enum { ROWS_PER_THREAD = Traits::GMMA_M / (Cta_tile::THREADS_PER_WARP / THREADS_PER_ROW) / Cta_tile::WARPS_M }; // The number of columns access by each thread. enum { COLS_PER_THREAD = Traits::GMMA_N / THREADS_PER_ROW / ELTS_PER_THREAD }; // Check the expected number of accumulator elements. static_assert(Accumulator::NUM_ELTS == COLS_PER_THREAD * ROWS_PER_THREAD * ELTS_PER_THREAD); // Ctor. template inline __device__ Softmax_gmma_32bit_8bit_base(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx) {} inline __device__ void unpack(Accumulator const (&acc)[MMAS_M][MMAS_N]) { float const scalef = reinterpret_cast(this->params_scale_bmm1_); #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { #pragma unroll for (int ii = 0; ii < COLS_PER_THREAD; ++ii) { float tmp_00 = acc[mi][ni].elt(ii * ROWS_PER_THREAD * ELTS_PER_THREAD + 0 * ELTS_PER_THREAD + 0) * scalef; float tmp_01 = acc[mi][ni].elt(ii * ROWS_PER_THREAD * ELTS_PER_THREAD + 0 * ELTS_PER_THREAD + 1) * scalef; float tmp_10 = acc[mi][ni].elt(ii * ROWS_PER_THREAD * ELTS_PER_THREAD + 1 * ELTS_PER_THREAD + 0) * scalef; float tmp_11 = acc[mi][ni].elt(ii * ROWS_PER_THREAD * ELTS_PER_THREAD + 1 * ELTS_PER_THREAD + 1) * scalef; int n_offset = ni * COLS_PER_THREAD * ELTS_PER_THREAD + ii * ELTS_PER_THREAD; this->elt_[mi * ROWS_PER_THREAD + 0][n_offset + 0] = tmp_00; this->elt_[mi * ROWS_PER_THREAD + 0][n_offset + 1] = tmp_01; this->elt_[mi * ROWS_PER_THREAD + 1][n_offset + 0] = tmp_10; this->elt_[mi * ROWS_PER_THREAD + 1][n_offset + 1] = tmp_11; } // ii } // ni } // mi } inline __device__ void unpack_noscale(Accumulator const (&acc)[MMAS_M][MMAS_N]) { #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { #pragma unroll for (int ii = 0; ii < COLS_PER_THREAD; ++ii) { float tmp_00 = acc[mi][ni].elt(ii * ROWS_PER_THREAD * ELTS_PER_THREAD + 0 * ELTS_PER_THREAD + 0); float tmp_01 = acc[mi][ni].elt(ii * ROWS_PER_THREAD * ELTS_PER_THREAD + 0 * ELTS_PER_THREAD + 1); float tmp_10 = acc[mi][ni].elt(ii * ROWS_PER_THREAD * ELTS_PER_THREAD + 1 * ELTS_PER_THREAD + 0); float tmp_11 = acc[mi][ni].elt(ii * ROWS_PER_THREAD * ELTS_PER_THREAD + 1 * ELTS_PER_THREAD + 1); int n_offset = ni * COLS_PER_THREAD * ELTS_PER_THREAD + ii * ELTS_PER_THREAD; this->elt_[mi * ROWS_PER_THREAD + 0][n_offset + 0] = tmp_00; this->elt_[mi * ROWS_PER_THREAD + 0][n_offset + 1] = tmp_01; this->elt_[mi * ROWS_PER_THREAD + 1][n_offset + 0] = tmp_10; this->elt_[mi * ROWS_PER_THREAD + 1][n_offset + 1] = tmp_11; } // ii } // ni } // mi } }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax, Cta_tile, Kernel_traits> : public Softmax_gmma_32bit_8bit_base< fmha::Hopper_qgmma_e4m3_fp32_traits, Cta_tile, Kernel_traits> { // The traits. using Traits = fmha::Hopper_qgmma_e4m3_fp32_traits; // The Base class. using Base = Softmax_gmma_32bit_8bit_base; using Accumulator = typename Base::Accumulator; enum { MMAS_M = Base::MMAS_M, MMAS_N = Base::MMAS_N, ROWS_PER_THREAD = Base::ROWS_PER_THREAD, COLS_PER_THREAD = Base::COLS_PER_THREAD, ELTS_PER_THREAD = Base::ELTS_PER_THREAD, }; // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx), params_scale_softmax_(params.scale_softmax) {} // Store the tile after softmax. template inline __device__ void store(Gmem_tile& gmem_tile) { float const scale = reinterpret_cast(this->params_scale_softmax_); Accumulator acc[MMAS_M][MMAS_N]; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { #pragma unroll for (int ii = 0; ii < COLS_PER_THREAD; ++ii) { int row = mi * ROWS_PER_THREAD; int col = ni * COLS_PER_THREAD * ELTS_PER_THREAD + ii * ELTS_PER_THREAD; float tmp_00 = this->elt_[row + 0][col + 0] * scale; float tmp_01 = this->elt_[row + 0][col + 1] * scale; float tmp_10 = this->elt_[row + 1][col + 0] * scale; float tmp_11 = this->elt_[row + 1][col + 1] * scale; int elt_idx = ii * ROWS_PER_THREAD * ELTS_PER_THREAD; acc[mi][ni].elt(elt_idx + 0 * ELTS_PER_THREAD + 0) = tmp_00; acc[mi][ni].elt(elt_idx + 0 * ELTS_PER_THREAD + 1) = tmp_01; acc[mi][ni].elt(elt_idx + 1 * ELTS_PER_THREAD + 0) = tmp_10; acc[mi][ni].elt(elt_idx + 1 * ELTS_PER_THREAD + 1) = tmp_11; } // ii } // ni } // mi // Delegate to the gmem tile to store. gmem_tile.store(acc); } // Pack the data to a fragment for the next GEMM. template inline __device__ void pack(Fragment_a (&dst)[K][M]) const { static_assert(M == 1); static_assert(Fragment_a::NUM_REGS == 4); static_assert(Fragment_a::NUM_ELTS == 16); // Acc per warp: 16 x 256 FP32 // A is 8 times(in K) 16 x 32 FP8, i.e. 4 registers per thread. static_assert(MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD % 8 == 0); static_assert(MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD == K * Fragment_a::NUM_ELTS / 2); float const scale = reinterpret_cast(this->params_scale_softmax_); // The canonical layout in K should be R0: [0,1,2,3] R2: [16,17,18,19] // Note below that this is not possible with the register layout of the accumulator. #pragma unroll for (int mi = 0; mi < M; ++mi) { #pragma unroll for (int ki = 0; ki < K; ++ki) { // 1st row - 8 elements per row. float tmp_00 = this->elt_[2 * mi + 0][8 * ki + 0] * scale; // + 0 float tmp_01 = this->elt_[2 * mi + 0][8 * ki + 1] * scale; // + 1 float tmp_02 = this->elt_[2 * mi + 0][8 * ki + 2] * scale; // + 8 float tmp_03 = this->elt_[2 * mi + 0][8 * ki + 3] * scale; // + 9 float tmp_04 = this->elt_[2 * mi + 0][8 * ki + 4] * scale; // +16 float tmp_05 = this->elt_[2 * mi + 0][8 * ki + 5] * scale; // +17 float tmp_06 = this->elt_[2 * mi + 0][8 * ki + 6] * scale; // +24 float tmp_07 = this->elt_[2 * mi + 0][8 * ki + 7] * scale; // +25 // 2nd row - 4 elements per row. float tmp_10 = this->elt_[2 * mi + 1][8 * ki + 0] * scale; // + 0 float tmp_11 = this->elt_[2 * mi + 1][8 * ki + 1] * scale; // + 1 float tmp_12 = this->elt_[2 * mi + 1][8 * ki + 2] * scale; // + 8 float tmp_13 = this->elt_[2 * mi + 1][8 * ki + 3] * scale; // + 9 float tmp_14 = this->elt_[2 * mi + 1][8 * ki + 4] * scale; // +16 float tmp_15 = this->elt_[2 * mi + 1][8 * ki + 5] * scale; // +17 float tmp_16 = this->elt_[2 * mi + 1][8 * ki + 6] * scale; // +24 float tmp_17 = this->elt_[2 * mi + 1][8 * ki + 7] * scale; // +25 // Pack to 4 registers. dst[ki][mi].reg(0) = fmha::float4_to_fp8x4(tmp_00, tmp_01, tmp_02, tmp_03); dst[ki][mi].reg(1) = fmha::float4_to_fp8x4(tmp_10, tmp_11, tmp_12, tmp_13); dst[ki][mi].reg(2) = fmha::float4_to_fp8x4(tmp_04, tmp_05, tmp_06, tmp_07); dst[ki][mi].reg(3) = fmha::float4_to_fp8x4(tmp_14, tmp_15, tmp_16, tmp_17); } } } uint32_t const params_scale_softmax_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// template struct Softmax, Cta_tile, Kernel_traits> : public Softmax_gmma_32bit_8bit_base< fmha::Hopper_igmma_int8_int32_traits, Cta_tile, Kernel_traits> { // The traits. using Traits = fmha::Hopper_igmma_int8_int32_traits; // The Base class. using Base = Softmax_gmma_32bit_8bit_base; using Accumulator = typename Base::Accumulator; enum { MMAS_M = Base::MMAS_M, MMAS_N = Base::MMAS_N, ROWS_PER_THREAD = Base::ROWS_PER_THREAD, COLS_PER_THREAD = Base::COLS_PER_THREAD, ELTS_PER_THREAD = Base::ELTS_PER_THREAD, }; // Ctor. template inline __device__ Softmax(Params const& params, void* smem, int bidb, int tidx) : Base(params, smem, bidb, tidx), params_scale_softmax_(params.scale_softmax) {} // Store the tile after softmax. template inline __device__ void store(Gmem_tile& gmem_tile) { float const scale = reinterpret_cast(this->params_scale_softmax_); Accumulator acc[MMAS_M][MMAS_N]; #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ni = 0; ni < MMAS_N; ++ni) { #pragma unroll for (int ii = 0; ii < COLS_PER_THREAD; ++ii) { int n_offset = ni * COLS_PER_THREAD * ELTS_PER_THREAD + ii * ELTS_PER_THREAD; float tmp_00 = this->elt_[mi * ROWS_PER_THREAD + 0][n_offset + 0]; float tmp_01 = this->elt_[mi * ROWS_PER_THREAD + 0][n_offset + 1]; float tmp_10 = this->elt_[mi * ROWS_PER_THREAD + 1][n_offset + 0]; float tmp_11 = this->elt_[mi * ROWS_PER_THREAD + 1][n_offset + 1]; int elt_offset = ii * ROWS_PER_THREAD * ELTS_PER_THREAD; acc[mi][ni].elt(elt_offset + 0 * ELTS_PER_THREAD + 0) = tmp_00 * scale; acc[mi][ni].elt(elt_offset + 0 * ELTS_PER_THREAD + 1) = tmp_01 * scale; acc[mi][ni].elt(elt_offset + 1 * ELTS_PER_THREAD + 0) = tmp_10 * scale; acc[mi][ni].elt(elt_offset + 1 * ELTS_PER_THREAD + 1) = tmp_11 * scale; } // ii } // ni } // mi // Delegate to the gmem tile to store. gmem_tile.store(acc); } // Pack the data to a fragment for the next GEMM. template inline __device__ void pack(Fragment_a (&dst)[K][M]) const { static_assert(M == 1); static_assert(Fragment_a::NUM_REGS == 4); static_assert(Fragment_a::NUM_ELTS == 16); // Acc per warp: 16 x 256 FP32 // A is 8 times(in K) 16 x 32 FP8, i.e. 4 registers per thread. static_assert(MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD % 8 == 0); static_assert(MMAS_N * COLS_PER_THREAD * ELTS_PER_THREAD == K * Fragment_a::NUM_ELTS / 2); float const scale = reinterpret_cast(this->params_scale_softmax_); // The canonical layout in K should be R0: [0,1,2,3] R2: [16,17,18,19] // Note below that this is not possible with the register layout of the accumulator. #pragma unroll for (int mi = 0; mi < M; ++mi) { #pragma unroll for (int ki = 0; ki < K; ++ki) { // 1st row - 8 elements per row. float tmp_00 = this->elt_[2 * mi + 0][8 * ki + 0] * scale; // + 0 float tmp_01 = this->elt_[2 * mi + 0][8 * ki + 1] * scale; // + 1 float tmp_02 = this->elt_[2 * mi + 0][8 * ki + 2] * scale; // + 8 float tmp_03 = this->elt_[2 * mi + 0][8 * ki + 3] * scale; // + 9 float tmp_04 = this->elt_[2 * mi + 0][8 * ki + 4] * scale; // +16 float tmp_05 = this->elt_[2 * mi + 0][8 * ki + 5] * scale; // +17 float tmp_06 = this->elt_[2 * mi + 0][8 * ki + 6] * scale; // +24 float tmp_07 = this->elt_[2 * mi + 0][8 * ki + 7] * scale; // +25 // 2nd row - 4 elements per row. float tmp_10 = this->elt_[2 * mi + 1][8 * ki + 0] * scale; // + 0 float tmp_11 = this->elt_[2 * mi + 1][8 * ki + 1] * scale; // + 1 float tmp_12 = this->elt_[2 * mi + 1][8 * ki + 2] * scale; // + 8 float tmp_13 = this->elt_[2 * mi + 1][8 * ki + 3] * scale; // + 9 float tmp_14 = this->elt_[2 * mi + 1][8 * ki + 4] * scale; // +16 float tmp_15 = this->elt_[2 * mi + 1][8 * ki + 5] * scale; // +17 float tmp_16 = this->elt_[2 * mi + 1][8 * ki + 6] * scale; // +24 float tmp_17 = this->elt_[2 * mi + 1][8 * ki + 7] * scale; // +25 // Pack to 4 registers. dst[ki][mi].reg(0) = fmha::float4_to_char4(tmp_00, tmp_01, tmp_02, tmp_03); dst[ki][mi].reg(1) = fmha::float4_to_char4(tmp_10, tmp_11, tmp_12, tmp_13); dst[ki][mi].reg(2) = fmha::float4_to_char4(tmp_04, tmp_05, tmp_06, tmp_07); dst[ki][mi].reg(3) = fmha::float4_to_char4(tmp_14, tmp_15, tmp_16, tmp_17); } } } uint32_t const params_scale_softmax_; }; //////////////////////////////////////////////////////////////////////////////////////////////////// // The softmax normalization statistics used by flash attention (l, m) template struct Softmax_statistics { // The shape of the MMA tile. using Mma_tile = typename Traits::template Mma_tile; // The number of MMAs in the M dimension. enum { MMAS_M = Mma_tile::MMAS_M }; // Ctor. template inline __device__ Softmax_statistics(Params const& params, void const* ptr, Binfo const& binfo, int tidx) : ptr_(reinterpret_cast(ptr)), seqlen_(binfo.actual_seqlen) { // The decomposition of the thread index into warp/lane. int warp = tidx / Cta_tile::THREADS_PER_WARP; int lane = tidx % Cta_tile::THREADS_PER_WARP; // The position of the the warp in the CTA. int warp_m = warp % Cta_tile::WARPS_M; // The position of the thread token_ = warp_m * Mma_tile::M_PER_MMA + lane / 4; // Compute the offset to the first token of the sequence. int64_t offset = binfo.bidb * params.h + binfo.bidh; // Move the pointer to the correct position. ptr_ += offset * params.lse_stride_in_bytes; } // Load the bias into registers (and expand). inline __device__ void load(int step) { #pragma unroll for (int mi = 0; mi < MMAS_M; ++mi) { #pragma unroll for (int ii = 0; ii < 2; ++ii) { // The index of the token. int token = token_; // At each iteration we jump over STEPQ elements. token += step * Cta_tile::M; // The extra offset inside the CTA. token += mi * Mma_tile::M_PER_MMA_PER_CTA + (ii & 0x1) * 8; // Fetch the value if the token is valid. float val = 0.0f; if (token < seqlen_) { val = reinterpret_cast(ptr_)[token]; } lm_[2 * mi + ii] = val; } } } // The pointer to the bias. int8_t const* ptr_; // The length of the sequence. int const seqlen_; // The token that this thread is loading. int token_; // The bias after expansion. float lm_[MMAS_M * 2]; }; } // namespace fmha