/*************************************************************************************************** * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ #pragma once #include // CUTE_HOST_DEVICE #include // cute::Tensor #include // cute::Copy_Atom namespace cute { // // copy_if -- Predicated Copy // template CUTE_HOST_DEVICE void copy_if(PrdTensor const& pred, Tensor const& src, Tensor & dst) { using SrcType = typename SrcEngine::value_type; using DstType = typename DstEngine::value_type; CUTE_UNROLL for (int i = 0; i < size(dst); ++i) { if (pred(i)) { dst(i) = static_cast(static_cast(src(i))); } } } // // copy_if -- Predicated CopyAtom // // Predicate Tensor is an Actual Tensor template CUTE_HOST_DEVICE void copy_if(Copy_Atom const& copy_atom, Tensor const& prd, // ([V],Rest...) Tensor const& src, // ( V, Rest...) Tensor & dst) // ( V, Rest...) { if constexpr (PrdLayout::rank == SrcLayout::rank - 1) { // Back-compat ONLY -- Delete? copy_if(copy_atom, make_tensor(prd.data(), prepend(prd.layout(), Layout<_1,_0>{})), src, dst); } else { static_assert(SrcLayout::rank == DstLayout::rank, "CopyAtom rank-mismatch."); static_assert(SrcLayout::rank == PrdLayout::rank, "CopyAtom rank-mismatch."); if constexpr (SrcLayout::rank == 1) { // Dispatch the copy copy_atom.call(prd, src, dst); } else { // Loop over all but the first mode constexpr int R = SrcLayout::rank; Tensor prd_v = group_modes<1,R>(prd); Tensor src_v = group_modes<1,R>(src); Tensor dst_v = group_modes<1,R>(dst); CUTE_UNROLL for (int i = 0; i < size<1>(dst_v); ++i) { copy_atom.call(prd_v(_,i), src_v(_,i), dst_v(_,i)); } } } } template [[deprecated("Use a bool-tensor or transform-tensor as predication.")]] CUTE_HOST_DEVICE void copy_if(Copy_Atom const& copy_atom, PredTensor const& pred, // (Rest...) Tensor const& src, // (V,Rest...) Tensor & dst) // (V,Rest...) { Tensor tpred = cute::lazy::transform(make_tensor(counting_iterator{}, replace<0>(shape(dst), _1{})), pred); return copy_if(copy_atom, tpred, src, dst); } // // copy_if -- AutoCopyAsync // template CUTE_HOST_DEVICE void copy_if(AutoCopyAsync const& cpy, PrdTensor const& pred, Tensor const& src, Tensor & dst) { using SrcElemWithConst = remove_reference_t; using SrcType = typename SrcEngine::value_type; using DstType = typename DstEngine::value_type; auto copy_op = []() { #if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED) if constexpr (is_gmem::value && is_smem::value && sizeof(SrcType) == sizeof(DstType)) { if constexpr (is_const_v && sizeof(SrcType) == 16) { return SM80_CP_ASYNC_CACHEGLOBAL{}; } else if constexpr (sizeof(SrcType) == 4 || sizeof(SrcType) == 8 || sizeof(SrcType) == 16) { return SM80_CP_ASYNC_CACHEALWAYS{}; } else { return UniversalCopy{}; } } else { return UniversalCopy{}; } CUTE_GCC_UNREACHABLE; #else return UniversalCopy{}; #endif }(); CUTE_UNROLL for (int i = 0; i < size(dst); ++i) { if (pred(i)) { copy_op.copy(src(i), dst(i)); } } } // // copy -- AutoCopyAsync // template CUTE_HOST_DEVICE void copy(AutoCopyAsync const& cpy, Tensor const& src, // (V,Rest...) Tensor & dst) // (V,Rest...) { copy_if(cpy, constant_fn{}, src, dst); } // // copy -- CopyAtom // template CUTE_HOST_DEVICE void copy(Copy_Atom const& copy_atom, Tensor const& src, // (V,Rest...) Tensor & dst) // (V,Rest...) { static_assert(SrcLayout::rank == DstLayout::rank, "CopyAtom rank-mismatch."); if constexpr (SrcLayout::rank == 1) { // Dispatch the copy copy_atom.call(src, dst); } else { // Loop over all but the first mode constexpr int R = SrcLayout::rank; Tensor src_v = group_modes<1,R>(src); Tensor dst_v = group_modes<1,R>(dst); if constexpr (is_static::value && is_static::value) { CUTE_STATIC_ASSERT_V(size<1>(src_v) == size<1>(dst_v)); // AutoFilter on the Rest-mode auto dst_null = nullspace(layout<1>(dst_v)); Tensor dst_n = zipped_divide(dst_v, make_tile(shape<0>(dst_v), dst_null)); // ((V, NLL), (_1, Rest)) Tensor src_n = zipped_divide(src_v, make_tile(shape<0>(src_v), dst_null)); // ((V, NLL), (_1, Rest)) CUTE_STATIC_ASSERT_V(size<1>(src_n) == size<1>(dst_n)); CUTE_STATIC_ASSERT_V((cosize<0,1>(dst_n.layout()) == Int<1>{}), "Nullspace definition error"); CUTE_STATIC_ASSERT_V((cosize<0,1>(src_n.layout()) == Int<1>{}), "Error: Ambiguous scatter detected in copy"); CUTE_STATIC_ASSERT_V((size<1,0>(dst_n) == Int<1>{})); CUTE_STATIC_ASSERT_V((size<1,0>(src_n) == Int<1>{})); Tensor dst_c = dst_n(make_coord(_,Int<0>{}),make_coord(Int<0>{},_)); // (V, Rest) Tensor src_c = src_n(make_coord(_,Int<0>{}),make_coord(Int<0>{},_)); // (V, Rest) CUTE_STATIC_ASSERT_V( size<1>(src_c) == size<1>(dst_c)); CUTE_STATIC_ASSERT_V(shape<0>(dst_c) == shape<0>(dst)); CUTE_STATIC_ASSERT_V(shape<0>(src_c) == shape<0>(src)); CUTE_UNROLL for (int i = 0; i < size<1>(dst_c); ++i) { copy_atom.call(src_c(_,i), dst_c(_,i)); } } else { CUTE_UNROLL for (int i = 0; i < size<1>(dst_v); ++i) { copy_atom.call(src_v(_,i), dst_v(_,i)); } } } } //////////////////////////////////////////////////////// // Special Auto-Vectorizing, Auto-Filtering Overloads // //////////////////////////////////////////////////////// // Specialization for AutoVectorizingCopyAssumedAlignment template CUTE_HOST_DEVICE void copy(AutoVectorizingCopyWithAssumedAlignment const&, Tensor const& src, Tensor & dst) { constexpr int common_elem = CUTE_STATIC_V(max_common_vector(src, dst)); static_assert(is_integral{} * sizeof_bits_v)>::value, "Error: Attempting a subbit write!"); if constexpr (common_elem > 1) { constexpr int align_bits = CUTE_STATIC_V(gcd(max_alignment(src), max_alignment(dst), Int{})); constexpr int vec_bits = gcd(common_elem * sizeof_bits_v, align_bits); if constexpr ((vec_bits % 8) == 0 && sizeof_bits_v < Int{}) { // If more than one element vectorizes to a multiple of 8bits that is larger than the value_type, then recast and copy using VecType = uint_bit_t; // Recast Tensor src_v = recast(src); Tensor dst_v = recast(dst); return copy_if(constant_fn{}, src_v, dst_v); } else { return copy_if(constant_fn{}, src, dst); } } else { return copy_if(constant_fn{}, src, dst); } } template struct AutoFilter { Base const& base; CUTE_HOST_DEVICE AutoFilter(Base const& b) : base(b) {} }; // Specialization for AutoFilter template CUTE_HOST_DEVICE void copy(AutoFilter const& copy_op, Tensor const& src, Tensor & dst) { if constexpr (is_constant::value) { auto dst_null = nullspace(dst.layout()); Tensor dst_n = zipped_divide(dst, dst_null); Tensor src_n = zipped_divide(src, dst_null); CUTE_STATIC_ASSERT_V(cosize<0>(dst_n.layout()) == Int<1>{}, "Nullspace definition error"); CUTE_STATIC_ASSERT_V(cosize<0>(src_n.layout()) == Int<1>{}, "Error: Ambiguous race-condition detected."); copy(copy_op.base, src_n(Int<0>{},_), dst_n(Int<0>{},_)); } else { copy(copy_op.base, src, dst); } } // Auto-vectorizing copy for static layouts template CUTE_HOST_DEVICE void copy(Tensor const& src, Tensor & dst) { if constexpr (is_static::value && is_static::value) { // Assume Tensors with static layouts (e.g. registers) have pointers that are 128b aligned return copy(AutoFilter(AutoVectorizingCopyWithAssumedAlignment<128>{}), src, dst); } else if constexpr (is_static::value && is_static::value) { // Tensors with static shapes can be filtered, but do not assume that dynamic layouts are aligned. return copy(AutoFilter(AutoVectorizingCopyWithAssumedAlignment<8>{}), src, dst); } else { // Do not assume that dynamic layouts are aligned. return copy(AutoVectorizingCopyWithAssumedAlignment<8>{}, src, dst); } } // Auto-vectorizing copy with assumed alignment up to 128bit. template CUTE_HOST_DEVICE void copy_aligned(Tensor const& src, Tensor & dst) { if constexpr (is_static::value && is_static::value) { // Tensors with static shapes can be filtered return copy(AutoFilter(AutoVectorizingCopyWithAssumedAlignment<128>{}), src, dst); } else { return copy(AutoVectorizingCopyWithAssumedAlignment<128>{}, src, dst); } } // Specializaton for Atom AutoVectorizingCopyAssumedAlignment template CUTE_HOST_DEVICE void copy(Copy_Atom, Args...> const&, Tensor const& src, Tensor & dst) { return copy(AutoVectorizingCopyWithAssumedAlignment{}, src, dst); } template CUTE_HOST_DEVICE void copy(Copy_Atom>, Args...> const&, Tensor const& src, Tensor & dst) { return copy(AutoVectorizingCopyWithAssumedAlignment{}, src, dst); } #if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED) template CUTE_HOST_DEVICE void copy(Copy_Traits const& atom, // Copy_Traits may or may not have the memory barrier in it already Tensor const& src, Tensor & dst) { using SrcType = typename SrcEngine::value_type; using DstType = typename DstEngine::value_type; static_assert(cute::is_same::value); static_assert((is_gmem::value && is_smem::value) || (is_smem::value && is_gmem::value), "Bulk Copy only supports gmem -> smem or smem -> gmem movement."); // G2S or S2G dispatch using BULK_COPY_OP = conditional_t::value, SM90_BULK_COPY_G2S, SM90_BULK_COPY_S2G>; // Find the common subtensor of src and dst auto tiler = max_common_layout(src, dst); constexpr int vec_elem = decltype(size(tiler))::value; constexpr int vec_bits = vec_elem * sizeof_bits_v; static_assert(vec_bits >= 128, "Expected at least 128-bits for BLKCP"); // Construct a new concrete Atom of the vector size using BulkAtom = Copy_Atom, CT_Args...>, SrcType>; auto bulk_atom = apply(atom.opargs_, [](auto const&... args) { return BulkAtom{args...}; }); return copy(bulk_atom, logical_divide(src, tiler), logical_divide(dst, tiler)); } // Backwards-compat. Throw out any extra Copy_Atom args. template CUTE_HOST_DEVICE void copy(Copy_Atom, CA_Args...> const& atom, Tensor const& src, Tensor & dst) { return copy(static_cast const&>(atom), src, dst); } #endif // #if defined(CUTE_COPY_ATOM_TMA_SM90_ENABLED) // // Decay TiledCopy to CopyAtom // template CUTE_HOST_DEVICE void copy_if(TiledCopy const& tiled_copy, PrdTensor const& pred, Tensor const& src, Tensor & dst) { return copy_if(static_cast(tiled_copy), pred, src, dst); } template CUTE_HOST_DEVICE void copy(TiledCopy const& tiled_copy, Tensor const& src, Tensor & dst) { return copy(static_cast(tiled_copy), src, dst); } template CUTE_HOST_DEVICE void copy_if(ThrCopy const& thr_copy, PrdTensor const& pred, Tensor const& src, Tensor & dst) = delete; template CUTE_HOST_DEVICE void copy(ThrCopy const& thr_copy, Tensor const& src, Tensor & dst) = delete; // // Catch uncaught policies // template CUTE_HOST_DEVICE void copy_if(CopyPolicy const& cpy, PredTensor const& prd, Tensor const& src, Tensor & dst) { static_assert(dependent_false, "Unrecognized CopyPolicy."); } template CUTE_HOST_DEVICE void copy(CopyPolicy const& cpy, Tensor const& src, Tensor & dst) { static_assert(dependent_false, "Unrecognized CopyPolicy."); } // // Accept mutable temporaries // template CUTE_HOST_DEVICE void copy_if(PrdTensor const& pred, Tensor const& src, Tensor && dst) { return copy_if(pred, src, dst); } template CUTE_HOST_DEVICE void copy_if(CopyPolicy const& copy_policy, PrdTensor const& pred, Tensor const& src, Tensor && dst) { return copy_if(copy_policy, pred, src, dst); } template CUTE_HOST_DEVICE void copy(Tensor const& src, Tensor && dst) { return copy(src, dst); } template CUTE_HOST_DEVICE void copy(CopyPolicy const& copy_policy, Tensor const& src, Tensor && dst) { return copy(copy_policy, src, dst); } template CUTE_HOST_DEVICE void copy_aligned(Tensor const& src, Tensor && dst) { return copy_aligned(src, dst); } } // end namespace cute