/*************************************************************************************************** * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ #pragma once #include "cutlass/fast_math.h" #include "cutlass/gemm_coord.hpp" #include "cutlass/kernel_hardware_info.hpp" #include "cutlass/gemm/kernel/tile_scheduler_params.h" #include "cute/layout.hpp" #include "cute/tensor.hpp" #include "cute/arch/cluster_sm90.hpp" #include "cutlass/pipeline/pipeline.hpp" namespace cutlass::gemm::kernel::detail { /////////////////////////////////////////////////////////////////////////////// // Users are not supposed to use this class directly. // This is a CRTP base class for the actual tile schedulers. template class StaticPersistentTileScheduler { private: uint64_t current_work_linear_idx_; uint64_t total_grid_size_; public: struct WorkTileInfo { int32_t M_idx = 0; int32_t N_idx = 0; int32_t L_idx = 0; bool is_valid_tile = false; CUTLASS_HOST_DEVICE bool is_valid() const { return is_valid_tile; } CUTLASS_HOST_DEVICE static WorkTileInfo invalid_work_tile() { return {-1, -1, -1, false}; } CUTLASS_HOST_DEVICE bool is_final_split(uint32_t k_tiles_per_output_tile) const { return true; } CUTLASS_HOST_DEVICE int32_t reduction_subtile_idx() const { return -1; } }; using Params = PersistentTileSchedulerSm90Params; using RasterOrder = typename Params::RasterOrder; using RasterOrderOptions = typename Params::RasterOrderOptions; static constexpr bool IsDynamicPersistent = false; public: struct Arguments { int max_swizzle_size = 1; RasterOrderOptions raster_order = RasterOrderOptions::Heuristic; }; template static Params to_underlying_arguments( ProblemShapeMNKL problem_shape_mnkl, TileShape tile_shape, ClusterShape cluster_shape, [[maybe_unused]] KernelHardwareInfo const& hw_info, Arguments const& arguments, [[maybe_unused]] void* workspace=nullptr, [[maybe_unused]] const uint32_t epilogue_subtile = 1, [[maybe_unused]] uint32_t ktile_start_alignment_count = 1u) { // We only need the tile and cluster shape during scheduler setup, so let FTAD do the magic static_assert(cute::is_static::value); static_assert(cute::is_static::value); dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape, cluster_shape); Params params; params.initialize( problem_blocks, to_gemm_coord(cluster_shape), hw_info, arguments.max_swizzle_size, arguments.raster_order ); return params; } CUTLASS_HOST_DEVICE static bool can_implement(Arguments const& args) { return args.max_swizzle_size >= 0; } CUTLASS_HOST_DEVICE StaticPersistentTileScheduler() { } CUTLASS_DEVICE explicit StaticPersistentTileScheduler(Params const& params_) : scheduler_params(params_) { // MSVC requires protecting use of CUDA-specific nonstandard syntax, // like blockIdx and gridDim, with __CUDA_ARCH__. #if defined(__CUDA_ARCH__) if (params_.raster_order_ == RasterOrder::AlongN) { current_work_linear_idx_ = uint64_t(blockIdx.x) + uint64_t(blockIdx.y) * uint64_t(gridDim.x); } else { current_work_linear_idx_ = uint64_t(blockIdx.x) * uint64_t(gridDim.y) + uint64_t(blockIdx.y); } total_grid_size_ = uint64_t(gridDim.x) * uint64_t(gridDim.y) * uint64_t(gridDim.z); #else CUTLASS_ASSERT(false && "This line should never be reached"); #endif } // Returns the initial work tile info that will be computed over template CUTLASS_DEVICE WorkTileInfo initial_work_tile_info(ClusterShape cluster_shape) { return get_current_work(); } CUTLASS_DEVICE WorkTileInfo get_current_work() const { return get_current_work_for_linear_idx(current_work_linear_idx_); } CUTLASS_DEVICE WorkTileInfo get_current_work_for_linear_idx(uint64_t linear_idx) const { if (linear_idx >= scheduler_params.blocks_per_problem_) { return WorkTileInfo::invalid_work_tile(); } // Map worker's linear index into the CTA tiled problem shape to the corresponding MNL indices uint64_t work_idx_l, remainder; scheduler_params.divmod_batch_(work_idx_l, remainder, linear_idx); uint64_t blk_per_grid_dim = scheduler_params.divmod_cluster_shape_minor_.divide(remainder); auto [work_idx_m, work_idx_n] = Subclass::get_work_idx_m_and_n(blk_per_grid_dim, scheduler_params.divmod_cluster_shape_major_, scheduler_params.divmod_cluster_shape_minor_, scheduler_params.divmod_cluster_blk_major_, scheduler_params.log_swizzle_size_, scheduler_params.raster_order_); return {work_idx_m, work_idx_n, static_cast(work_idx_l), true}; } CUTLASS_DEVICE void advance_to_next_work(uint32_t advance_count = 1) { current_work_linear_idx_ += total_grid_size_ * uint64_t(advance_count); } CUTLASS_DEVICE bool is_last_tile(WorkTileInfo& work_tile_info, uint32_t advance_count = 1) const { if (continue_current_work(work_tile_info)) { return false; } return not get_current_work_for_linear_idx( current_work_linear_idx_ + (total_grid_size_ * uint64_t(advance_count)) ).is_valid(); } // Computes the linear index within a batch given M and N tile offsets within the batch. // This essentially inverts the mapping performed in get_work_idx_m_and_n static CUTLASS_DEVICE uint64_t get_linear_idx_from_m_and_n( int32_t tile_m, int32_t tile_n, FastDivmodU64Pow2 const& divmod_cluster_shape_major, FastDivmodU64Pow2 const& divmod_cluster_shape_minor, FastDivmodU64 const& divmod_cluster_blk_major, int32_t log_swizzle_size, RasterOrder raster_order) { uint64_t minor_work_idx, major_work_idx, cluster_minor_offset; if (raster_order == RasterOrder::AlongN) { minor_work_idx = static_cast(tile_m); major_work_idx = static_cast(tile_n); uint64_t cluster_m = divmod_cluster_shape_minor.divide(tile_m) * divmod_cluster_shape_minor.divisor; cluster_minor_offset = tile_m - cluster_m; } else { major_work_idx = static_cast(tile_m); minor_work_idx = static_cast(tile_n); uint64_t cluster_n = divmod_cluster_shape_minor.divide(tile_n) * divmod_cluster_shape_minor.divisor; cluster_minor_offset = tile_n - cluster_n; } uint64_t cluster_idx_minor, cluster_idx_major, cluster_major_offset; cluster_idx_minor = divmod_cluster_shape_minor.divide(minor_work_idx - cluster_minor_offset); divmod_cluster_shape_major(cluster_idx_major, cluster_major_offset, major_work_idx); uint64_t cluster_idx_minor_div_swizzle = cluster_idx_minor >> log_swizzle_size; uint64_t offset = cluster_idx_minor & ((1 << log_swizzle_size) - 1); uint64_t extra = cluster_idx_minor_div_swizzle * divmod_cluster_blk_major.divisor + cluster_idx_major; uint64_t cluster_id = (extra << log_swizzle_size) | offset; return (cluster_id * divmod_cluster_shape_major.divisor + cluster_major_offset) * divmod_cluster_shape_minor.divisor + cluster_minor_offset; } // Given the inputs, computes the total number of output blocks over which this problem will compute. // Note that this is only the logical size of our grid, not the physical grid we will actually launch. template CUTLASS_HOST_DEVICE static dim3 get_tiled_cta_shape_mnl(ProblemShapeMNKL problem_shape_mnkl, BlockShape cta_shape, ClusterShape cluster_shape) { auto cta_m = cute::size(cute::ceil_div(cute::shape<0>(problem_shape_mnkl), cute::shape<0>(cta_shape))); auto cta_n = cute::size(cute::ceil_div(cute::shape<1>(problem_shape_mnkl), cute::shape<1>(cta_shape))); return Params::get_tiled_cta_shape_mnl( to_gemm_coord(problem_shape_mnkl), to_gemm_coord(cluster_shape), cta_m, cta_n ); } // Reloaded interface that receives WorkTileInfo to deduce next work. // Kernel helper function to get next work tile CUTLASS_DEVICE auto fetch_next_work(WorkTileInfo work_tile_info) { if (continue_current_work(work_tile_info)) { return cute::make_tuple(work_tile_info, true); } advance_to_next_work(); return cute::make_tuple(get_current_work(), true); } // Given the inputs, computes the total number of output blocks over which this problem will compute. // Note that this is only the logical size of our grid, not the physical grid we will actually launch. template CUTLASS_HOST_DEVICE static dim3 get_tiled_cta_shape_mnl(ProblemShapeMNKL problem_shape_mnkl, TileShape tile_shape_mnk, AtomThrShape atom_thr_shape_mnk, ClusterShape cluster_shape_mnk) { auto [tiles_m, tiles_n, tiles_l] = product_each(ceil_div(select<0,1,3>(problem_shape_mnkl), take<0,2>(tile_shape_mnk))); auto cta_m = round_nearest(tiles_m * size<0>(atom_thr_shape_mnk), size<0>(cluster_shape_mnk)); auto cta_n = round_nearest(tiles_n * size<1>(atom_thr_shape_mnk), size<1>(cluster_shape_mnk)); return Params::get_tiled_cta_shape_mnl( to_gemm_coord(problem_shape_mnkl), to_gemm_coord(cluster_shape_mnk), cta_m, cta_n ); } // Kernel helper function to get next work tile template CUTLASS_DEVICE auto fetch_next_work( WorkTileInfo work_tile_info, TileSchedulerPipeline& scheduler_pipeline, TileSchedulerPipelineState scheduler_pipe_consumer_state) { return fetch_next_work(work_tile_info); } CUTLASS_DEVICE static auto work_tile_to_cta_coord(WorkTileInfo work_tile_info) { // Get every cta coord in three dimensions of the cluster auto [cta_m_in_cluster, cta_n_in_cluster, cta_l_in_cluster] = cute::block_id_in_cluster(); return make_coord( work_tile_info.M_idx + static_cast(cta_m_in_cluster), work_tile_info.N_idx + static_cast(cta_n_in_cluster), _, work_tile_info.L_idx + static_cast(cta_l_in_cluster) ); } CUTLASS_DEVICE static auto work_tile_to_cta_coord(WorkTileInfo work_tile_info, dim3 block_id_in_cluster) { // Get every cta coord in three dimensions of the cluster auto [cta_m_in_cluster, cta_n_in_cluster, cta_l_in_cluster] = block_id_in_cluster; return make_coord( work_tile_info.M_idx + static_cast(cta_m_in_cluster), work_tile_info.N_idx + static_cast(cta_n_in_cluster), _, work_tile_info.L_idx + static_cast(cta_l_in_cluster) ); } // Given the inputs, computes the physical grid we should launch. template CUTLASS_HOST_DEVICE static dim3 get_grid_shape( [[maybe_unused]] Params const& params, ProblemShapeMNKL problem_shape_mnk, BlockShape cta_shape, ClusterShape cluster_shape, KernelHardwareInfo hw_info, Arguments arguments = Arguments{}, bool truncate_by_problem_size=true) { auto problem_shape_mnkl = cute::append<4>(problem_shape_mnk, cute::Int<1>{}); dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, cta_shape, cluster_shape); return Params::get_grid_shape( problem_blocks, to_gemm_coord(cluster_shape), hw_info, arguments.max_swizzle_size, arguments.raster_order, /* truncate_by_problem_size = */true ); } // Given the inputs, computes the physical grid we should launch. template static dim3 get_grid_shape( Params const& params, ProblemShapeMNKL problem_shape_mnkl, TileShape tile_shape_mnk, AtomThrShape atom_thr_shape_mnk, ClusterShape cluster_shape_mnk, KernelHardwareInfo hw_info) { dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape_mnkl, tile_shape_mnk, atom_thr_shape_mnk, cluster_shape_mnk); Arguments args{}; if constexpr (!std::is_const_v) { args.max_swizzle_size = 1 << params.log_swizzle_size_; } args.raster_order = params.raster_order_ == RasterOrder::AlongN ? RasterOrderOptions::AlongN : RasterOrderOptions::AlongM; return Params::get_grid_shape( problem_blocks, to_gemm_coord(cluster_shape_mnk), hw_info, args.max_swizzle_size, args.raster_order, /* truncate_by_problem_size = */true ); } // Convert CTA-level work tile info to cluster-level tile coord CUTLASS_DEVICE auto work_tile_to_cluster_coord_mnkl(WorkTileInfo work_tile_info) const { // TileScheduler works at CTA-level, kernel works at cluster-level int m_coord = idx2crd(work_tile_info.M_idx / scheduler_params.cluster_shape_m_, scheduler_params.problem_tiles_m_); int n_coord = idx2crd(work_tile_info.N_idx / scheduler_params.cluster_shape_n_, scheduler_params.problem_tiles_n_); int l_coord = idx2crd(work_tile_info.L_idx, scheduler_params.problem_tiles_l_); return make_coord(m_coord, n_coord, _, l_coord); } // Returns whether the block assigned this work should compute the epilogue for the corresponding // output tile. For the basic tile scheduler, this is always true. CUTLASS_HOST_DEVICE static bool compute_epilogue(WorkTileInfo const&, Params const&) { return true; } CUTLASS_HOST_DEVICE static bool compute_epilogue(WorkTileInfo const&) { return true; } // Performs the reduction across splits for a given output tile. Since this scheduler does // not split output tiles, no reduction is needed. template CUTLASS_DEVICE static void fixup(Params const&, WorkTileInfo const&, FrgTensorC&, uint32_t, uint32_t) {} // Performs the reduction across splits for a given output tile. No fixup is required for // work units returned by this scheduler. template CUTLASS_DEVICE void fixup(WorkTileInfo const&, FrgTensorC&, uint32_t, uint32_t) const { } // Returns whether the current WorkTileInfo passed in should continue to be used. Since // this scheduler only schedules work in units of single, full output tiles, the WorkTileInfo // passed in should not be used after having been processed. CUTLASS_DEVICE static bool continue_current_work(WorkTileInfo&) { return false; } template CUTLASS_DEVICE auto get_k_tile_iterator(WorkTileInfo const& work_tile_info, ProblemShapeMNKL problem_shape_MNKL, TileShape tile_shape, Shape) { auto k_tiles = cute::ceil_div(cute::get<2>(problem_shape_MNKL), cute::get<2>(tile_shape)); return cute::make_coord_iterator(k_tiles); } template CUTLASS_HOST_DEVICE static int get_work_k_tile_count(WorkTileInfo const& work_tile_info, ProblemShape problem_shape, TileShape tile_shape) { // All work units returned by this scheduler cover the entire K iteration // space of the output tile assigned to the work unit. return cute::size(cute::ceil_div(cute::get<2>(problem_shape), cute::get<2>(tile_shape))); } CUTLASS_HOST_DEVICE static uint32_t get_work_k_tile_start(WorkTileInfo const&) { // All work units returned by this scheduler start from K tile 0 return 0u; } CUTLASS_DEVICE static bool need_separate_reduction(Params const& params) { return false; } CUTLASS_DEVICE bool is_work_tile_for_reduction(WorkTileInfo const& work_tile_info, Params const& params) { return false; } template CUTLASS_DEVICE void separate_reduction( Params const& params, WorkTileInfo const& work_tile_info, FrgTensorC& accumulators, uint32_t num_barriers, uint32_t barrier_idx) { } // Shares the accumulator set with peers in the global workspace template CUTLASS_DEVICE static void share( Params const& params, WorkTileInfo const& work_tile_info, FrgTensorC& accumulators, uint32_t num_barriers, uint32_t barrier_idx) { } CUTLASS_DEVICE static bool valid_warpgroup_in_work_tile(WorkTileInfo const& work_tile_info) { return true; } CUTLASS_DEVICE static bool requires_separate_reduction(Params const& params) { return false; } public: // Sink scheduler params as a member Params scheduler_params; }; } // namespace cutlass::gemm::kernel::detail