#pragma once

// @generated by torchgen/gen.py from NativeFunction.h

#include <c10/core/Scalar.h>
#include <c10/core/Storage.h>
#include <c10/core/TensorOptions.h>
#include <c10/util/Deprecated.h>
#include <c10/util/Optional.h>
#include <c10/core/QScheme.h>
#include <ATen/core/Reduction.h>
#include <ATen/core/Tensor.h>
#include <tuple>
#include <vector>


namespace at::native {

// NOTE: only declarations appear here; the definitions are not part of this
// header. The file is marked @generated, so hand edits may be overwritten when
// it is regenerated.

/// Functional variant: returns its two result tensors by value.
///
/// NOTE(review): the name suggests a CUDA scaled matrix multiply of `self` and
/// `mat2` -- confirm against the operator schema before relying on that.
///
/// @param self           First input tensor (read-only reference).
/// @param mat2           Second input tensor (read-only reference).
/// @param bias           Optional tensor; defaults to an empty optional.
/// @param out_dtype      Optional scalar type; defaults to `::std::nullopt`.
/// @param scale_a        Optional tensor; defaults to an empty optional.
/// @param scale_b        Optional tensor; defaults to an empty optional.
/// @param scale_result   Optional tensor; defaults to an empty optional.
/// @param use_fast_accum Boolean flag; defaults to `false`.
/// @return A tuple of two tensors. Presumably (result, amax), mirroring the
///         `out` / `out_amax` parameters of the out variant -- TODO confirm.
TORCH_API ::std::tuple<at::Tensor,at::Tensor> _scaled_mm_cuda(
    const at::Tensor & self,
    const at::Tensor & mat2,
    const ::std::optional<at::Tensor> & bias={},
    ::std::optional<at::ScalarType> out_dtype=::std::nullopt,
    const ::std::optional<at::Tensor> & scale_a={},
    const ::std::optional<at::Tensor> & scale_b={},
    const ::std::optional<at::Tensor> & scale_result={},
    bool use_fast_accum=false);

/// Out variant: same leading parameters as `_scaled_mm_cuda`, but without
/// default arguments, followed by two caller-supplied mutable tensors.
///
/// @param self           First input tensor (read-only reference).
/// @param mat2           Second input tensor (read-only reference).
/// @param bias           Optional tensor (no default in this overload).
/// @param out_dtype      Optional scalar type (no default in this overload).
/// @param scale_a        Optional tensor (no default in this overload).
/// @param scale_b        Optional tensor (no default in this overload).
/// @param scale_result   Optional tensor (no default in this overload).
/// @param use_fast_accum Boolean flag (no default in this overload).
/// @param out            Mutable tensor reference supplied by the caller.
/// @param out_amax       Mutable tensor reference supplied by the caller.
/// @return A tuple of two tensor references. Presumably these alias `out` and
///         `out_amax` -- TODO confirm against the definition.
TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> _scaled_mm_out_cuda(
    const at::Tensor & self,
    const at::Tensor & mat2,
    const ::std::optional<at::Tensor> & bias,
    ::std::optional<at::ScalarType> out_dtype,
    const ::std::optional<at::Tensor> & scale_a,
    const ::std::optional<at::Tensor> & scale_b,
    const ::std::optional<at::Tensor> & scale_result,
    bool use_fast_accum,
    at::Tensor & out,
    at::Tensor & out_amax);

} // namespace at::native
