/*************************************************************************************************** * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: BSD-3-Clause * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * **************************************************************************************************/ /*! \file \brief Boost-like numeric conversion operator for CUTLASS numeric types */ #pragma once #if !defined(__CUDACC_RTC__) #include #endif #include "cutlass/cutlass.h" #include "cutlass/numeric_types.h" #include "cutlass/transform/thread/unary_op.h" #include "cutlass/array.h" #include "cutlass/half.h" #include "cutlass/bfloat16.h" namespace cutlass { ///////////////////////////////////////////////////////////////////////////////////////////////// /// Floating-point rounding style similar to Standard Library's formats but supporting /// additional rounding options. enum class FloatRoundStyle { round_indeterminate, ///< rounding mode unknown round_toward_zero, ///< round toward zero round_to_nearest, ///< round to nearest even round_to_nearest_satfinite, ///< round to nearest even, capping value to min and max of destination type round_toward_infinity, ///< round toward infinity round_toward_neg_infinity, ///< round toward negative infinity round_half_ulp_truncate, ///< add 0.5ulp to integer representation then round toward zero round_half_ulp_trunc_dntz ///< like round_half_ulp_truncate, except denorms are rounded *toward* zero }; ///////////////////////////////////////////////////////////////////////////////////////////////// template < typename T, typename S, FloatRoundStyle Round = FloatRoundStyle::round_to_nearest > struct NumericConverter { using result_type = T; using source_type = S; static FloatRoundStyle const round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { return static_cast(s); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// // // Partial specializations for float => int32_t // ///////////////////////////////////////////////////////////////////////////////////////////////// template <> struct NumericConverter { using result_type = int32_t; using source_type = float; static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { #if __CUDA_ARCH__ return __float2int_rn(s); #elif !defined(__CUDACC_RTC__) std::fesetround(FE_TONEAREST); return static_cast(std::nearbyint(s)); #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; template <> struct NumericConverter { using result_type = int32_t; using source_type = float; static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { #if __CUDA_ARCH__ return __float2int_rz(s); #elif !defined(__CUDACC_RTC__) std::fesetround(FE_TOWARDZERO); return (result_type)std::nearbyint(s); #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// // // Partial specializations for float => int8_t // ///////////////////////////////////////////////////////////////////////////////////////////////// template <> struct NumericConverter { using result_type = int8_t; using source_type = float; static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { #if defined(__CUDA_ARCH__) int32_t intermediate; asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(intermediate) : "f"(s)); return static_cast(intermediate); #elif !defined(__CUDACC_RTC__) std::fesetround(FE_TONEAREST); int32_t intermediate = (int32_t)std::nearbyint(s); // Low-end saturation intermediate = std::max(intermediate, (int32_t)std::numeric_limits::lowest()); // High-end saturation intermediate = std::min(intermediate, (int32_t)std::numeric_limits::max()); return static_cast(intermediate); #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; template <> struct NumericConverter { using result_type = int8_t; using source_type = float; static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { #if defined(__CUDA_ARCH__) int32_t intermediate; asm volatile("cvt.rzi.sat.s8.f32 %0, %1;" : "=r"(intermediate) : "f"(s)); return static_cast(intermediate); #elif !defined(__CUDACC_RTC__) std::fesetround(FE_TOWARDZERO); int32_t intermediate = (int32_t)std::nearbyint(s); // Low-end saturation intermediate = std::max(intermediate, (int32_t)std::numeric_limits::lowest()); // High-end saturation intermediate = std::min(intermediate, (int32_t)std::numeric_limits::max()); return static_cast(intermediate); #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; template <> struct NumericConverter { using result_type = uint8_t; using source_type = float; static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { #if defined(__CUDA_ARCH__) int32_t intermediate; asm volatile("cvt.rni.sat.u8.f32 %0, %1;" : "=r"(intermediate) : "f"(s)); return static_cast(intermediate); #elif !defined(__CUDACC_RTC__) std::fesetround(FE_TONEAREST); int32_t intermediate = (int32_t)std::nearbyint(s); // Low-end saturation intermediate = std::max(intermediate, (int32_t)std::numeric_limits::lowest()); // High-end saturation intermediate = std::min(intermediate, (int32_t)std::numeric_limits::max()); return static_cast(intermediate); #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; template <> struct NumericConverter { using result_type = uint8_t; using source_type = float; static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { #if __CUDA_ARCH__ int32_t intermediate; asm volatile("cvt.rzi.sat.u8.f32 %0, %1;" : "=r"(intermediate) : "f"(s)); return static_cast(intermediate); #elif !defined(__CUDACC_RTC__) std::fesetround(FE_TOWARDZERO); int32_t intermediate = (int32_t)std::nearbyint(s); // Low-end saturation intermediate = std::max(intermediate, (int32_t)std::numeric_limits::lowest()); // High-end saturation intermediate = std::min(intermediate, (int32_t)std::numeric_limits::max()); return static_cast(intermediate); #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// // // Partial specializations for cutlass::half_t => int8_t // ///////////////////////////////////////////////////////////////////////////////////////////////// template <> struct NumericConverter { using result_type = int8_t; using source_type = cutlass::half_t; static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { #if defined(__CUDA_ARCH__) union { int8_t int8[2]; int16_t int16; }; union { cutlass::half_t fp16; int16_t int16_in; }; fp16 = s; asm volatile ("cvt.rni.sat.s8.f16 %0, %1;" : "=h"(int16) : "h"(int16_in)); return int8[0]; #elif !defined(__CUDACC_RTC__) std::fesetround(FE_TONEAREST); int32_t intermediate = (int32_t)std::nearbyint(static_cast(s)); // Low-end saturation intermediate = std::max(intermediate, (int32_t)std::numeric_limits::lowest()); // High-end saturation intermediate = std::min(intermediate, (int32_t)std::numeric_limits::max()); return static_cast(intermediate); #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// // // Partial specializations for float => integer_subbyte // ///////////////////////////////////////////////////////////////////////////////////////////////// template struct NumericConverter, float, Round> { private: static constexpr bool result_is_signed = true; public: using result_type = integer_subbyte; using source_type = float; static constexpr FloatRoundStyle round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const& src) { using middle_type = int; static_assert(8 * sizeof(middle_type) > Bits, "This conversion " "requires that integer_subbyte have fewer representation bits " "than the number of bits in int."); auto middle = NumericConverter::convert(src); return NumericConverter::convert(middle); } CUTLASS_HOST_DEVICE result_type operator()(source_type const& s) const { return convert(s); } }; template struct NumericConverter, float, Round> { private: static constexpr bool result_is_signed = false; public: using result_type = integer_subbyte; using source_type = float; static constexpr FloatRoundStyle round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const& src) { using middle_type = unsigned; static_assert(8 * sizeof(middle_type) > Bits, "This conversion " "requires that integer_subbyte have fewer representation bits " "than the number of bits in unsigned int."); auto middle = NumericConverter::convert(src); return NumericConverter::convert(middle); } CUTLASS_HOST_DEVICE result_type operator()(source_type const& s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization for float <= cutlass::half_t template struct NumericConverter { using result_type = T; using source_type = T; static FloatRoundStyle const round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { return s; } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// // // Partial specializations for float <=> cutlass::half_t // ///////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization for float <= cutlass::half_t template struct NumericConverter { using result_type = float; using source_type = cutlass::half_t; static FloatRoundStyle const round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { result_type result = static_cast(s); return result; } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Specialization for round-to-nearest template <> struct NumericConverter { using result_type = cutlass::half_t; using source_type = float; static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { result_type result = static_cast(s); return result; } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Specialization for round-toward-zero template <> struct NumericConverter { using result_type = cutlass::half_t; using source_type = float; static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero; /// Round toward zero CUTLASS_HOST_DEVICE static result_type convert(source_type const & flt) { #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) return cutlass::half_t(__float2half_rz(flt)); #else // software implementation rounds toward nearest even unsigned const& s = reinterpret_cast(flt); uint16_t sign = uint16_t((s >> 16) & 0x8000); int32_t exp = int32_t((s >> 23) & 0xff) - 127; int mantissa = s & 0x7fffff; uint16_t u = 0; if ((s & 0x7fffffff) == 0) { // sign-preserving zero return cutlass::half_t::bitcast(sign); } if (exp > 15) { if (exp == 128 && mantissa) { // not a number u = 0x7fff; } else { // overflow to infinity u = sign | 0x7c00; } return cutlass::half_t::bitcast(u); } if (exp >= -14) { // normal fp32 to normal fp16 u = uint16_t((uint32_t(exp + 15) & 0x1f) << 10); u = uint16_t(u | (mantissa >> 13)); } else { // normal single-precision to subnormal cutlass::half_t-precision representation int rshift = (-14 - exp); if (rshift < 32) { mantissa |= (1 << 23); mantissa = (mantissa >> rshift); u = (uint16_t(mantissa >> 13) & 0x3ff); } else { mantissa = 0; u = 0; } } u |= sign; return cutlass::half_t::bitcast(u); #endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// // // Partial specializations for float <=> cutlass::bfloat16_t // ///////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization for float <= cutlass::bfloat16_t template struct NumericConverter { using result_type = float; using source_type = cutlass::bfloat16_t; static FloatRoundStyle const round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { return static_cast(s); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; template <> struct NumericConverter { using result_type = cutlass::bfloat16_t; using source_type = float; static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { return static_cast(s); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; template <> struct NumericConverter { using result_type = cutlass::bfloat16_t; using source_type = float; static FloatRoundStyle const round_style = FloatRoundStyle::round_half_ulp_truncate; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { uint32_t x32 = reinterpret_cast(s); #if defined(__CUDA_ARCH__) if (::isfinite(s)) { x32 += 0x8000; } #else if (std::isfinite(s)) { x32 += 0x8000; } #endif uint16_t x16 = uint16_t((x32 >> 16) & 0xffff); return cutlass::bfloat16_t::bitcast(x16); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; template <> struct NumericConverter { using result_type = cutlass::bfloat16_t; using source_type = float; static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { uint32_t x32 = reinterpret_cast(s); uint16_t x16 = uint16_t(x32 >> 16); return cutlass::bfloat16_t::bitcast(x16); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// // // Partial specializations for float <=> cutlass::tfloat32_t // ///////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization for float <= cutlass::tfloat32_t template struct NumericConverter { using result_type = float; using source_type = cutlass::tfloat32_t; static FloatRoundStyle const round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { return static_cast(s); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; template <> struct NumericConverter { using result_type = cutlass::tfloat32_t; using source_type = float; static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { unsigned storage = reinterpret_cast(s); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 asm volatile("cvt.rn.tf32.f32 %0, %1;" : "=r"(storage) : "r"(storage)); #else if ((storage & 0x7f800000) != 0x7f800000) { bool mantissa_bit = ((storage & (1 << 13)) != 0); bool round_bit = ((storage & (1 << 12)) != 0); bool sticky_bit = ((storage & ((1 << 12) - 1)) != 0); if ((round_bit && sticky_bit) || (round_bit && mantissa_bit)) { storage += uint32_t(1 << 13); } // Note, the following is intentionally commented out. TF32 // does not define the low order bits, so they may be left in // an undefined state. // // By not truncating these bit explicitly, we avoid an extra logical // operation. // // TF32 may be implicitly converted to float by performing this // operation as needed. // // storage = (storage & ~0x1fff); } else if (storage & ~0xff800000) { storage = 0x7fffffff; } #endif return cutlass::tfloat32_t::bitcast(storage); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; template <> struct NumericConverter { using result_type = cutlass::tfloat32_t; using source_type = float; static FloatRoundStyle const round_style = FloatRoundStyle::round_half_ulp_truncate; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { return cutlass::tfloat32_t::round_half_ulp_truncate(s); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// This rounding operation is similar to half_ulp_truncate except it rounds denorms toward zero. /// It avoids predicated code, though it requires a temporary register. template <> struct NumericConverter { using result_type = cutlass::tfloat32_t; using source_type = float; static FloatRoundStyle const round_style = FloatRoundStyle::round_half_ulp_trunc_dntz; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { unsigned y = reinterpret_cast(s); y = y & 0xff800000; float d = reinterpret_cast(y); float z = d / float(1 << 11) + s; return reinterpret_cast(z); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; template <> struct NumericConverter { using result_type = cutlass::tfloat32_t; using source_type = float; static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { uint32_t x = reinterpret_cast(s); return cutlass::tfloat32_t::bitcast(x & 0xffffe000); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// // // Conversion operator for float to cutlass::tfloat32_t big and small values // ///////////////////////////////////////////////////////////////////////////////////////////////// template < FloatRoundStyle RoundBig = FloatRoundStyle::round_toward_zero, FloatRoundStyle RoundSmall = FloatRoundStyle::round_half_ulp_truncate > struct NumericConverterFastF32 { // result_type holds big cutlass::tfloat32_t at idx(0) and small cutlass::tfloat32_t at idx(1) using result_type = Array; // source data type using source_type = float; // rounding styles for big and small part static FloatRoundStyle const kRoundBig = RoundBig; static FloatRoundStyle const kRoundSmall = RoundSmall; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { result_type result; NumericConverter convert_big_; NumericConverter convert_small_; // convert and fill cutlass::tfloat32_t big at idx 0 result[0] = convert_big_(source); // convert and fill cutlass::tfloat32_t small at idx 1 result[1] = convert_small_(source - static_cast(result[0])); return result; } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// // // Conversion and Clamp operator for Integers // ///////////////////////////////////////////////////////////////////////////////////////////////// template < typename T, typename S > struct NumericConverterClamp { using result_type = T; using source_type = S; CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { NumericConverter convert_op; result_type const kClamp_max = cutlass::platform::numeric_limits::max(); result_type const kClamp_min = cutlass::platform::numeric_limits::lowest(); if (s < (source_type)kClamp_min) return kClamp_min; if (s > (source_type)kClamp_max) return kClamp_max; return convert_op(s); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; // This converter is needed to enable cutlass::half_t output types when using int32_t accumulators. // Since floating-point types do not require a clamp, this converter simply casts from // the source type to cutlass::half_t. template < typename S > struct NumericConverterClamp { using result_type = cutlass::half_t; using source_type = S; CUTLASS_HOST_DEVICE static result_type convert(source_type const &source) { return static_cast(source); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// // // Conversion operator for Array // ///////////////////////////////////////////////////////////////////////////////////////////////// /// Conversion operator for Array template < typename T, typename S, int N, FloatRoundStyle Round = FloatRoundStyle::round_to_nearest, typename Transform = cutlass::transform::thread::UnaryTransform::Identity > struct NumericArrayConverter { using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; static_assert(platform::is_same::value || platform::is_same::value, "Unary Operator not supported."); CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { result_type result; NumericConverter convert_; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < N; ++i) { if (platform::is_same::value) { result[i] = convert_(s[i]); } else { // conjugate result[i] = conj(convert_(s[i])); } } return result; } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; template < typename T, int N, FloatRoundStyle Round, typename Transform > struct NumericArrayConverter { using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; static_assert(platform::is_same::value || platform::is_same::value, "Unary Operator not supported."); CUTLASS_HOST_DEVICE static result_type convert(source_type const &source) { if (platform::is_same::value) { return source; } else { result_type result; for (int i = 0; i < N; ++i) { result[i] = conj(static_cast(source[i])); } return result; } } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization for Array <= Array, round to nearest template <> struct NumericArrayConverter { using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) Array result; reinterpret_cast<__half2 &>(result) = __float22half2_rn(reinterpret_cast(source)); return result; #else NumericConverter convert_; // NOTE: cutlass::Array is NOT an aggregate type and // below `{}` does NOT conduct zero initialization. Below `{}` will // conduct default initialization (calling default ctr). We use this syntax // to resolve compiler warning on uninitialized member variable. Array result{}; result[0] = convert_(source[0]); result[1] = convert_(source[1]); return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array, round to nearest template struct NumericArrayConverter { using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) float2 result2 = __half22float2(reinterpret_cast<__half2 const &>(source)); return { float{result2.x}, float{result2.y} }; #else NumericConverter convert_; return { convert_(source[0]), convert_(source[1]) }; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization for Array <= Array template < int N, FloatRoundStyle Round > struct NumericArrayConverter { using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { NumericArrayConverter convert_vector_; NumericConverter convert_element_; result_type result; Array *result_ptr = reinterpret_cast *>(&result); Array const *source_ptr = reinterpret_cast const *>(&source); CUTLASS_PRAGMA_UNROLL for (int i = 0; i < N / 2; ++i) { result_ptr[i] = convert_vector_(source_ptr[i]); } if (N % 2) { result[N - 1] = convert_element_(source[N - 1]); } return result; } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < int N, FloatRoundStyle Round > struct NumericArrayConverter { using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { NumericArrayConverter convert_vector_; NumericConverter convert_element_; result_type result; Array *result_ptr = reinterpret_cast *>(&result); Array const *source_ptr = reinterpret_cast const *>(&source); CUTLASS_PRAGMA_UNROLL for (int i = 0; i < N / 2; ++i) { result_ptr[i] = convert_vector_(source_ptr[i]); } if (N % 2) { result[N - 1] = convert_element_(source[N - 1]); } return result; } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) ///////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization for Array <= Array, round to nearest template <> struct NumericArrayConverter { using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { unsigned d; asm("cvt.rn.bf16x2.f32 %0, %1, %2;\n" : "=r"(d) : "f"(source[1]), "f"(source[0]) ); return reinterpret_cast(d); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array, round to nearest with min/max saturation template <> struct NumericArrayConverter { using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest_satfinite; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { unsigned d; asm("cvt.rn.satfinite.bf16x2.f32 %0, %1, %2;\n" : "=r"(d) : "f"(source[1]), "f"(source[0]) ); return reinterpret_cast(d); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization for Array <= Array template < int N, FloatRoundStyle Round > struct NumericArrayConverter { using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { NumericArrayConverter convert_vector_; NumericConverter convert_element_; result_type result; Array *result_ptr = reinterpret_cast *>(&result); Array const *source_ptr = reinterpret_cast const *>(&source); CUTLASS_PRAGMA_UNROLL for (int i = 0; i < N / 2; ++i) { result_ptr[i] = convert_vector_(source_ptr[i]); } if (N % 2) { result[N - 1] = convert_element_(source[N - 1]); } return result; } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; #endif // if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) ///////////////////////////////////////////////////////////////////////////////////////////////// // Conditional guards to enable partial specialization for packed integers #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && \ ((__CUDACC_VER_MAJOR__ > 10) || \ ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { NumericConverter convert_element_; result_type result; result[0] = convert_element_(source[0]); return result; } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { uint32_t tmp; asm volatile( "cvt.pack.sat.s8.s32.b32 %0, %2, %1, 0;\n" : "=r"(tmp) : "r"(source[0]), "r"(source[1])); uint16_t out = (tmp & 0xffff); return reinterpret_cast(out); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { unsigned out; asm volatile( "{ .reg .u32 r4;" "cvt.pack.sat.s8.s32.b32 r4, %4, %3, 0;" "cvt.pack.sat.s8.s32.b32 %0, %2, %1, r4;" "}" : "=r"(out) : "r"(source[0]), "r"(source[1]), "r"(source[2]), "r"(source[3])); return reinterpret_cast(out); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < int N, FloatRoundStyle Round > struct NumericArrayConverter { static_assert(!(N % 4), "N must be multiple of 4."); using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { NumericArrayConverter convert_vector_; result_type result; Array *result_ptr = reinterpret_cast *>(&result); Array const *source_ptr = reinterpret_cast const *>(&source); CUTLASS_PRAGMA_UNROLL for (int i = 0; i < N / 4; ++i) { result_ptr[i] = convert_vector_(source_ptr[i]); } return result; } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { NumericConverter convert_element_; result_type result; result[0] = convert_element_(source[0]); return result; } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { uint32_t tmp; asm volatile( "cvt.pack.sat.u8.s32.b32 %0, %2, %1, 0;\n" : "=r"(tmp) : "r"(source[0]), "r"(source[1])); uint16_t out = (tmp & 0xffff); return reinterpret_cast(out); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { unsigned out; asm volatile( "{ .reg .u32 r4;" "cvt.pack.sat.u8.s32.b32 r4, %4, %3, 0;" "cvt.pack.sat.u8.s32.b32 %0, %2, %1, r4;" "}" : "=r"(out) : "r"(source[0]), "r"(source[1]), "r"(source[2]), "r"(source[3])); return reinterpret_cast(out); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < int N, FloatRoundStyle Round > struct NumericArrayConverter { static_assert(!(N % 4), "N must be multiple of 4."); using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { NumericArrayConverter convert_vector_; result_type result; Array *result_ptr = reinterpret_cast *>(&result); Array const *source_ptr = reinterpret_cast const *>(&source); CUTLASS_PRAGMA_UNROLL for (int i = 0; i < N / 4; ++i) { result_ptr[i] = convert_vector_(source_ptr[i]); } return result; } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; #endif ///////////////////////////////////////////////////////////////////////////////////////////////// // // Partial specializations for Array <=> Array // ///////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_element = float; using source_element = cutlass::float_e4m3_t; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP8_CVT_ENABLED) uint32_t out_fp16; uint16_t const& src_packed = reinterpret_cast(source); asm volatile( \ "{\n" \ "cvt.rn.f16x2.e4m3x2 %0, %1;\n" \ "}\n" : "=r"(out_fp16): "h"(src_packed)); float2 res0 = __half22float2(reinterpret_cast<__half2 &>(out_fp16)); result_type out; out[0] = res0.x; out[1] = res0.y; return out; #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 2; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_element = cutlass::float_e4m3_t; using source_element = float; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP8_CVT_ENABLED) uint16_t out; asm volatile( \ "{\n" \ "cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;\n" \ "}" \ : "=h"(out) : "f"(source[0]), "f"(source[1])); return reinterpret_cast(out); #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 2; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_element = float; using source_element = cutlass::float_e5m2_t; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP8_CVT_ENABLED) uint32_t out_fp16; uint16_t const& src_packed = reinterpret_cast(source); asm volatile( \ "{\n" \ "cvt.rn.f16x2.e5m2x2 %0, %1;\n" \ "}\n" : "=r"(out_fp16): "h"(src_packed)); float2 res0 = __half22float2(reinterpret_cast<__half2 &>(out_fp16)); result_type out; out[0] = res0.x; out[1] = res0.y; return out; #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 2; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_element = cutlass::float_e5m2_t; using source_element = float; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP8_CVT_ENABLED) uint16_t out; asm volatile( \ "{\n" \ "cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;\n" \ "}" \ : "=h"(out) : "f"(source[0]), "f"(source[1])); return reinterpret_cast(out); #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 2; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// // // Partial specializations for Array <=> Array // ///////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_element = cutlass::half_t; using source_element = cutlass::float_e4m3_t; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP8_CVT_ENABLED) result_type out; uint32_t& reg = reinterpret_cast(out); uint16_t const& src_packed = reinterpret_cast(source); asm volatile( \ "{\n" \ "cvt.rn.f16x2.e4m3x2 %0, %1;\n" \ "}\n" : "=r"(reg): "h"(src_packed)); return out; #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 2; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_element = cutlass::float_e4m3_t; using source_element = cutlass::half_t; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP8_CVT_ENABLED) uint16_t out; asm volatile( \ "{\n" \ "cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;\n" \ "}" \ : "=h"(out) : "r"(reinterpret_cast(source))); return reinterpret_cast(out); #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 2; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_element = cutlass::half_t; using source_element = cutlass::float_e5m2_t; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP8_CVT_ENABLED) result_type out; uint32_t& reg = reinterpret_cast(out); uint16_t const& src_packed = reinterpret_cast(source); asm volatile( \ "{\n" \ "cvt.rn.f16x2.e5m2x2 %0, %1;\n" \ "}\n" : "=r"(reg): "h"(src_packed)); return out; #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 2; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_element = cutlass::float_e5m2_t; using source_element = cutlass::half_t; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP8_CVT_ENABLED) uint16_t out; asm volatile( \ "{\n" \ "cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;\n" \ "}" \ : "=h"(out) : "r"(reinterpret_cast(source))); return reinterpret_cast(out); #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 2; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// // // Partial specializations for Array <=> Array // ///////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_element = cutlass::bfloat16_t; using source_element = cutlass::float_e4m3_t; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP8_CVT_ENABLED) uint32_t res_half; uint16_t const& src_packed = reinterpret_cast(source); asm volatile( \ "{\n" \ "cvt.rn.f16x2.e4m3x2 %0, %1;\n" \ "}\n" : "=r"(res_half): "h"(src_packed)); float2 res_float = __half22float2(reinterpret_cast<__half2 &>(res_half)); NumericArrayConverter converter; return converter(reinterpret_cast const&>(res_float)); #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 2; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_element = cutlass::float_e4m3_t; using source_element = cutlass::bfloat16_t; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP8_CVT_ENABLED) NumericArrayConverter converter; Array res_float = converter(source); uint16_t out; asm volatile( \ "{\n" \ "cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;\n" \ "}" \ : "=h"(out) : "f"(res_float[0]), "f"(res_float[1])); return reinterpret_cast(out); #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 2; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_element = cutlass::bfloat16_t; using source_element = cutlass::float_e5m2_t; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP8_CVT_ENABLED) uint32_t res_half; uint16_t const& src_packed = reinterpret_cast(source); asm volatile( \ "{\n" \ "cvt.rn.f16x2.e5m2x2 %0, %1;\n" \ "}\n" : "=r"(res_half): "h"(src_packed)); float2 res_float = __half22float2(reinterpret_cast<__half2 &>(res_half)); NumericArrayConverter converter; return converter(reinterpret_cast const&>(res_float)); #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 2; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverter { using result_element = cutlass::float_e5m2_t; using source_element = cutlass::bfloat16_t; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP8_CVT_ENABLED) NumericArrayConverter converter; Array res_float = converter(source); uint16_t out; asm volatile( \ "{\n" \ "cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;\n" \ "}" \ : "=h"(out) : "f"(res_float[0]), "f"(res_float[1])); return reinterpret_cast(out); #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 2; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; namespace detail { /// Special converters that can be used with 4 8-bit elements packed in a register. /// Common use is for fast FP8 converters. template < typename T, typename S, FloatRoundStyle Round = FloatRoundStyle::round_to_nearest, typename Transform = cutlass::transform::thread::UnaryTransform::Identity > struct NumericArrayConverterPacked4Element { using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; static_assert(platform::is_same::value || platform::is_same::value, "Unary Operator not supported."); CUTLASS_HOST_DEVICE static result_type convert(source_type const & s) { result_type result; NumericConverter convert_; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 4; ++i) { if (platform::is_same::value) { result[i] = convert_(s[i]); } else { // conjugate result[i] = conj(convert_(s[i])); } } return result; } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverterPacked4Element { using result_element = float; using source_element = cutlass::float_e4m3_t; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP8_CVT_ENABLED) uint32_t out_fp16[2]; uint32_t const& src_packed = reinterpret_cast(source); asm volatile( \ "{\n" \ ".reg .b16 lo, hi;\n" \ "mov.b32 {lo, hi}, %2;\n" \ "cvt.rn.f16x2.e4m3x2 %0, lo;\n" \ "cvt.rn.f16x2.e4m3x2 %1, hi;\n" \ "}\n" : "=r"(out_fp16[0]), "=r"(out_fp16[1]) : "r"(src_packed)); float2 res0 = __half22float2(reinterpret_cast<__half2 &>(out_fp16[0])); float2 res1 = __half22float2(reinterpret_cast<__half2 &>(out_fp16[1])); result_type out; out[0] = res0.x; out[1] = res0.y; out[2] = res1.x; out[3] = res1.y; return out; #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 4; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverterPacked4Element { using result_element = cutlass::float_e4m3_t; using source_element = float; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP8_CVT_ENABLED) uint32_t out; asm volatile( \ "{\n" \ ".reg .b16 lo;\n" \ ".reg .b16 hi;\n" \ "cvt.rn.satfinite.e4m3x2.f32 lo, %2, %1;\n" \ "cvt.rn.satfinite.e4m3x2.f32 hi, %4, %3;\n" \ "mov.b32 %0, {lo, hi};\n" \ "}" \ : "=r"(out) : "f"(source[0]), "f"(source[1]), "f"(source[2]), "f"(source[3])); return reinterpret_cast(out); #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 4; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverterPacked4Element { using result_element = float; using source_element = float_ue4m3_t; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP8_CVT_ENABLED) uint32_t out_fp16[2]; uint32_t const& src_packed = reinterpret_cast(source); asm volatile( \ "{\n" \ ".reg .b16 lo, hi;\n" \ "mov.b32 {lo, hi}, %2;\n" \ "cvt.rn.f16x2.e4m3x2 %0, lo;\n" \ "cvt.rn.f16x2.e4m3x2 %1, hi;\n" \ "}\n" : "=r"(out_fp16[0]), "=r"(out_fp16[1]) : "r"(src_packed)); float2 res0 = __half22float2(reinterpret_cast<__half2 &>(out_fp16[0])); float2 res1 = __half22float2(reinterpret_cast<__half2 &>(out_fp16[1])); result_type out; out[0] = res0.x; out[1] = res0.y; out[2] = res1.x; out[3] = res1.y; return out; #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 4; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverterPacked4Element { using result_element = float_ue4m3_t; using source_element = float; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP8_CVT_ENABLED) uint32_t out; asm volatile( \ "{\n" \ ".reg .b16 lo;\n" \ ".reg .b16 hi;\n" \ "cvt.rn.satfinite.e4m3x2.f32 lo, %2, %1;\n" \ "cvt.rn.satfinite.e4m3x2.f32 hi, %4, %3;\n" \ "mov.b32 %0, {lo, hi};\n" \ "}" \ : "=r"(out) : "f"(source[0]), "f"(source[1]), "f"(source[2]), "f"(source[3])); return reinterpret_cast(out); #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 4; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// // // Partial specializations for Array <=> Array // ///////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverterPacked4Element { using result_element = float; using source_element = float_ue8m0_t; using result_type = Array; using source_type = Array; using BfloatArr = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_UE8M0_CVT_ENABLED) uint32_t out_fp16[2]; uint32_t const& src_packed = reinterpret_cast(source); asm volatile( \ "{\n" \ ".reg .b16 lo, hi;\n" \ "mov.b32 {lo, hi}, %2;\n" \ "cvt.rn.bf16x2.ue8m0x2 %0, lo;\n" \ "cvt.rn.bf16x2.ue8m0x2 %1, hi;\n" \ "}\n" : "=r"(out_fp16[0]), "=r"(out_fp16[1]) : "r"(src_packed)); NumericArrayConverter bf2fp32_converter; auto res0 = bf2fp32_converter(reinterpret_cast &>(out_fp16[0])); auto res1 = bf2fp32_converter(reinterpret_cast &>(out_fp16[1])); result_type out; out[0] = res0[0]; out[1] = res0[1]; out[2] = res1[0]; out[3] = res1[1]; return out; #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 4; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template <> struct NumericArrayConverterPacked4Element { using result_element = float_ue8m0_t; using source_element = float; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_infinity; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_UE8M0_CVT_ENABLED) uint32_t out; asm volatile( \ "{\n" \ ".reg .b16 lo;\n" \ ".reg .b16 hi;\n" \ "cvt.rp.satfinite.ue8m0x2.f32 lo, %2, %1;\n" \ "cvt.rp.satfinite.ue8m0x2.f32 hi, %4, %3;\n" \ "mov.b32 %0, {lo, hi};\n" \ "}" \ : "=r"(out) : "f"(source[0]), "f"(source[1]), "f"(source[2]), "f"(source[3])); return reinterpret_cast(out); #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 4; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template <> struct NumericArrayConverterPacked4Element { using result_element = float_ue8m0_t; using source_element = float; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_UE8M0_CVT_ENABLED) uint32_t out; asm volatile( \ "{\n" \ ".reg .b16 lo;\n" \ ".reg .b16 hi;\n" \ "cvt.rz.satfinite.ue8m0x2.f32 lo, %2, %1;\n" \ "cvt.rz.satfinite.ue8m0x2.f32 hi, %4, %3;\n" \ "mov.b32 %0, {lo, hi};\n" \ "}" \ : "=r"(out) : "f"(source[0]), "f"(source[1]), "f"(source[2]), "f"(source[3])); return reinterpret_cast(out); #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 4; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; template < FloatRoundStyle Round > struct NumericArrayConverterPacked4Element { using result_element = float_ue8m0_t; using source_element = float; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_infinity; CUTLASS_HOST_DEVICE static result_type convert(source_type const & source) { //default maps to RP mode. return NumericArrayConverterPacked4Element{}(source); } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// // // Partial specializations for Array <=> Array // ///////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverterPacked4Element { using result_element = cutlass::detail::float_e2m3_unpack8bits_t; using source_element = float; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP4FP6_CVT_ENABLED) uint32_t out; asm volatile( \ "{\n" \ ".reg .b16 lo;\n" \ ".reg .b16 hi;\n" \ "cvt.rn.satfinite.e2m3x2.f32 lo, %2, %1;\n" \ "cvt.rn.satfinite.e2m3x2.f32 hi, %4, %3;\n" \ "mov.b32 %0, {lo, hi};\n" \ "}" \ : "=r"(out) : "f"(source[0]), "f"(source[1]), "f"(source[2]), "f"(source[3])); return reinterpret_cast(out); #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 4; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverterPacked4Element { using result_element = float; using source_element = cutlass::detail::float_e2m3_unpack8bits_t; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP4FP6_CVT_ENABLED) uint32_t out_fp16[2]; uint32_t const& src_packed = reinterpret_cast(source); asm volatile( \ "{\n" \ ".reg .b16 lo, hi;\n" \ "mov.b32 {lo, hi}, %2;\n" \ "cvt.rn.f16x2.e2m3x2 %0, lo;\n" \ "cvt.rn.f16x2.e2m3x2 %1, hi;\n" \ "}\n" : "=r"(out_fp16[0]), "=r"(out_fp16[1]) : "r"(src_packed)); float2 res0 = __half22float2(reinterpret_cast<__half2 &>(out_fp16[0])); float2 res1 = __half22float2(reinterpret_cast<__half2 &>(out_fp16[1])); result_type out; out[0] = res0.x; out[1] = res0.y; out[2] = res1.x; out[3] = res1.y; return out; #else result_type result; NumericConverter converter; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < 4; ++i) { result[i] = converter(source[i]); } return result; #endif } CUTLASS_HOST_DEVICE result_type operator()(source_type const &s) const { return convert(s); } }; ///////////////////////////////////////////////////////////////////////////////////////////////// // // Partial specializations for Array <=> Array // ///////////////////////////////////////////////////////////////////////////////////////////////// /// Partial specialization for Array <= Array template < FloatRoundStyle Round > struct NumericArrayConverterPacked4Element { using result_element = cutlass::detail::float_e3m2_unpack8bits_t; using source_element = float; using result_type = Array; using source_type = Array; static FloatRoundStyle const round_style = Round; CUTLASS_DEVICE static result_type convert(source_type const & source) { #if defined(CUDA_PTX_FP4FP6_CVT_ENABLED) uint32_t out; asm volatile( \ "{\n" \ ".reg .b16 lo;\n" \ ".reg .b16 hi;\n" \ "cvt.rn.satfinite.e3m2x2.f32 lo, %2, %1;\n" \ "cvt.rn.satfinite.e3m2x2.f32 hi, %4, %3;\n" \ "mov.b32 %0, {lo, hi};\n" \ "}" \ : "=r"(out) : "f"(source[0]), "f"(source[1]), "f"(source[2]), "f"(source[3])); return reinterpret_cast