Merged

Changes from all commits
7 changes: 6 additions & 1 deletion backends/cadence/aot/functions_hifi.yaml
@@ -62,6 +62,11 @@
    - arg_meta: null
      kernel_name: torch::executor::full_out

- op: mean.out
  kernels:
    - arg_meta: null
      kernel_name: cadence::impl::HiFi::mean_dim_out

- op: mul.out
  kernels:
    - arg_meta: null
@@ -105,7 +110,7 @@
- op: where.self_out
  kernels:
    - arg_meta: null
-     kernel_name: torch::executor::where_out
+     kernel_name: cadence::impl::HiFi::where_out

# custom ops
- func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
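For orientation, a hedged sketch of the C++ entry point this yaml change now binds to. The PR's op_where.cpp is not shown in this diff, so the declaration below is an assumption based on the standard ExecuTorch out-variant convention (ctx, inputs..., out):

// Sketch only: assumed declaration of the HiFi where kernel that the
// where.self_out entry above now dispatches to.
namespace cadence::impl::HiFi::native {
Tensor& where_out(
    RuntimeContext& ctx,
    const Tensor& condition, // bool tensor choosing between self and other
    const Tensor& self,
    const Tensor& other,
    Tensor& out); // written in place and returned by reference
} // namespace cadence::impl::HiFi::native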
2 changes: 2 additions & 0 deletions backends/cadence/hifi/kernels/CMakeLists.txt
@@ -13,6 +13,8 @@ add_library(
  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
)
# Let files say "include <executorch/path/to/header.h>".
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
28 changes: 28 additions & 0 deletions backends/cadence/hifi/kernels/kernels.h
@@ -55,6 +55,34 @@ extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(
    const FLOAT32* __restrict__ p_inp2,
    const WORD32* const p_inp2_shape);

extern "C" WORD32 xa_nn_elm_where_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const FLOAT32* __restrict__ p_inp1,
const FLOAT32* __restrict__ p_inp2,
const unsigned char* __restrict__ p_condition,
WORD32 num_elm);

extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
const FLOAT32* __restrict__ p_inp1,
const WORD32* const p_inp1_shape,
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape,
const unsigned char* __restrict__ p_condition,
const WORD32* const p_condition_shape);

extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
const FLOAT32* __restrict__ p_inp,
const WORD32* const p_inp_shape,
const WORD32* __restrict__ p_axis,
WORD32 num_out_dims,
WORD32 num_inp_dims,
WORD32 num_axis_dims,
void* __restrict__ p_scratch_in);

namespace cadence {
namespace impl {
namespace HiFi {
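A minimal sketch of driving the flat (same-shape, no-broadcast) where kernel declared above. The tensor names are illustrative, and the bool-to-unsigned-char cast is an assumption about how the condition buffer is handed to NNLib:

// Sketch only: all three tensors assumed float32 with identical shapes.
const unsigned char* p_cond = reinterpret_cast<const unsigned char*>(
    condition.const_data_ptr<bool>());
const float* p_a = self.const_data_ptr<float>();
const float* p_b = other.const_data_ptr<float>();
float* p_out = out.mutable_data_ptr<float>();
// NNLib convention: returns 0 on success, nonzero on error.
WORD32 ret = xa_nn_elm_where_f32xf32_f32(
    p_out, p_a, p_b, p_cond, static_cast<WORD32>(out.numel()));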
12 changes: 3 additions & 9 deletions backends/cadence/hifi/operators/CMakeLists.txt
@@ -22,19 +22,12 @@ endif()
set(_aten_ops__srcs
  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp"
  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp"
+ "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp"
  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp"
  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp"
  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp"
  "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp"
- "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp"
- "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp"
- "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp"
- "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/index_util.cpp"
- "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/kernel_ops_util.cpp"
- "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"
- "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
- "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
- "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp"
+ "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_where.cpp"
  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"
  "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp"
@@ -57,6 +50,7 @@ set(_aten_ops__srcs
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
"${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp"
)
add_library(aten_ops_cadence ${_aten_ops__srcs})
target_link_libraries(aten_ops_cadence PUBLIC executorch)
170 changes: 170 additions & 0 deletions backends/cadence/hifi/operators/op_mean.cpp
@@ -0,0 +1,170 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
#include <executorch/kernels/portable/cpu/util/reduce_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

#include <executorch/backends/cadence/hifi/kernels/kernels.h>

using exec_aten::ScalarType;
using exec_aten::Tensor;
using executorch::aten::RuntimeContext;
using executorch::runtime::ArrayRef;
using torch::executor::Error;
using torch::executor::optional;

namespace cadence {
namespace impl {
namespace HiFi {
namespace native {

int prepare_data(
    const Tensor& in,
    Tensor& out,
Reviewer comment: const Tensor& out

    optional<ArrayRef<int64_t>> dim_list,
    int* inp_shape,
    int* out_shape,
    int* p_axis,
Reviewer comment: Nit: What is p_axis used for? Can we please rename this to make the usage obvious (or add comment)?

    int num_inp_dims,
    int num_out_dims) {
  for (int i = 0; i < num_inp_dims; i++) {
    inp_shape[i] = in.size(i);
  }

  for (int i = 0; i < num_out_dims; i++) {
    out_shape[i] = out.size(i);
  }
Reviewer comment (on lines +37 to +43): Nit: can we make this a helper function that accepts a const Tensor& and mutable int*


  int num_axis_dims = 0;
  for (const auto& d : dim_list.value()) {
    if (d < 0) {
      p_axis[num_axis_dims] = num_inp_dims + d;
      num_axis_dims++;
    } else {
      p_axis[num_axis_dims] = d;
      num_axis_dims++;
    }
  }

  return num_axis_dims;
}
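To make the dim handling concrete: negative entries in dim_list are wrapped by adding the input rank, so the axis array always ends up with non-negative indices. A hypothetical call (values purely illustrative):

// Illustrative only: 4-D input, reducing over dims {1, -1}.
// prepare_data wraps -1 to 3, so p_axis = {1, 3} and the return value is 2.
int inp_shape[4], out_shape[4], p_axis[4];
int num_axis_dims = prepare_data(
    in, out, dim_list, inp_shape, out_shape, p_axis,
    /*num_inp_dims=*/4, /*num_out_dims=*/4);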

Tensor& mean_dim_out(
    RuntimeContext& ctx,
    const Tensor& in,
    optional<ArrayRef<int64_t>> dim_list,
    bool keepdim,
    optional<ScalarType> dtype,
    Tensor& out) {
  ET_KERNEL_CHECK(
      ctx,
      torch::executor::check_mean_dim_args(in, dim_list, keepdim, dtype, out),
      InvalidArgument,
      out);

  ET_KERNEL_CHECK(
      ctx,
      torch::executor::resize_reduction_out(in, dim_list, keepdim, out) ==
          Error::Ok,
      InvalidArgument,
      out);

  constexpr auto name = "mean.out";
  constexpr int kNnlibMaxDim = 4;

  bool optimized = 1;
Reviewer comment: Nit: Please use true or false instead of 0 or 1


  if (out.scalar_type() != ScalarType::Float)
    optimized = 0;

  if (in.dim() > kNnlibMaxDim)
    optimized = 0;

  if (optimized) {
    float* __restrict__ p_out = out.mutable_data_ptr<float>();
    const float* __restrict__ p_inp =
        (const float* __restrict__)in.const_data_ptr<float>();

    int num_elm = in.numel();

    int num_inp_dims = in.dim();
    int num_out_dims = out.dim();

    int inp_shape[kNnlibMaxDim];
    int out_shape[kNnlibMaxDim];
    int p_axis[kNnlibMaxDim];

    for (int i = 0; i < kNnlibMaxDim; i++) {
      out_shape[i] = 1;
      inp_shape[i] = 1;
      p_axis[i] = 1;
    }

    int num_axis_dims = prepare_data(
        in,
        out,
        dim_list,
        inp_shape,
        out_shape,
        p_axis,
        num_inp_dims,
        num_out_dims);

    if (num_axis_dims == num_inp_dims) {
      num_out_dims = 1;
      out_shape[0] = 1;
    }

    int scratch_size = xa_nn_reduce_getsize_nhwc(
        -3, inp_shape, num_inp_dims, p_axis, num_axis_dims, 1);

    void* __restrict__ p_scratch_in = (void* __restrict__)malloc(scratch_size);
Reviewer comment: Let's use temporary allocator instead of malloc please.
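A hedged sketch of what that comment asks for, assuming the kernel context's temporary allocator (allocate_temp) is available on this target; the chosen error code is illustrative:

// Sketch only: temp allocator instead of malloc; the runtime reclaims
// this memory after the kernel finishes, so no free() is needed.
executorch::runtime::Result<void*> temp_mem = ctx.allocate_temp(scratch_size);
ET_KERNEL_CHECK(ctx, temp_mem.ok(), MemoryAllocationFailed, out);
void* p_scratch_in = temp_mem.get();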


    xa_nn_reduce_mean_4D_f32_f32(
Reviewer comment: Let's add a check for the output of nnlib api.

  ret = xa_nn_reduce_mean_4D_f32_f32(...)
  ET_KERNEL_CHECK(
      ctx,
      ret == 0,
      ERROR_CODE,
      out);

        p_out,
        out_shape,
        p_inp,
        inp_shape,
        p_axis,
        num_out_dims,
        num_inp_dims,
        num_axis_dims,
        p_scratch_in);

    return out;
  }
Reviewer comment (on lines +90 to +142): Can we move this to a separate inline function? (like hifi_optimized_mean)
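A hedged sketch of the refactor this comment suggests: the NNLib fast path above hoisted into its own helper. The name comes from the review comment; the parameter list and the free() call are assumptions, not part of the PR:

// Sketch only: the fast path from mean_dim_out, factored into a helper.
inline void hifi_optimized_mean(
    const Tensor& in,
    optional<ArrayRef<int64_t>>& dim_list,
    Tensor& out) {
  constexpr int kNnlibMaxDim = 4;
  float* __restrict__ p_out = out.mutable_data_ptr<float>();
  const float* __restrict__ p_inp = in.const_data_ptr<float>();
  int num_inp_dims = in.dim();
  int num_out_dims = out.dim();
  // NNLib works on fixed 4-D shapes, so pad every slot with 1s first.
  int inp_shape[kNnlibMaxDim] = {1, 1, 1, 1};
  int out_shape[kNnlibMaxDim] = {1, 1, 1, 1};
  int p_axis[kNnlibMaxDim] = {1, 1, 1, 1};
  int num_axis_dims = prepare_data(
      in, out, dim_list, inp_shape, out_shape, p_axis,
      num_inp_dims, num_out_dims);
  if (num_axis_dims == num_inp_dims) { // full reduction collapses to one dim
    num_out_dims = 1;
    out_shape[0] = 1;
  }
  int scratch_size = xa_nn_reduce_getsize_nhwc(
      -3, inp_shape, num_inp_dims, p_axis, num_axis_dims, 1);
  void* p_scratch_in = malloc(scratch_size); // see the malloc review note
  xa_nn_reduce_mean_4D_f32_f32(
      p_out, out_shape, p_inp, inp_shape, p_axis,
      num_out_dims, num_inp_dims, num_axis_dims, p_scratch_in);
  free(p_scratch_in); // not in the PR as written; added for this sketch
}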


  ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] {
    ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] {
      CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
      const size_t num = torch::executor::get_reduced_dim_product(in, dim_list);

      for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) {
        CTYPE_OUT sum = 0;
        if (in.numel() > 0) {
          sum = torch::executor::map_reduce_over_dim_list<CTYPE_IN, CTYPE_OUT>(
              [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
              [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
              in,
              dim_list,
              out_ix);
        }
        out_data[out_ix] = sum / static_cast<float>(num);
      }
    });
  });

  return out;
}

} // namespace native
} // namespace HiFi
} // namespace impl
} // namespace cadence
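Finally, a hedged test-style sketch of exercising the new kernel directly. TensorFactory and the default-constructed context follow ExecuTorch's unit-test conventions; they are assumptions here, not part of this PR:

// Sketch only: mean over dim 1 of a 2x3 float tensor.
using torch::executor::testing::TensorFactory;
TensorFactory<ScalarType::Float> tf;
Tensor in = tf.make({2, 3}, {1, 2, 3, 4, 5, 6});
Tensor out = tf.zeros({2});
int64_t dims[1] = {1};
optional<ArrayRef<int64_t>> dim_list{ArrayRef<int64_t>(dims, 1)};
RuntimeContext ctx{};
// keepdim = false, dtype unspecified; expected result: {2.0, 5.0}.
cadence::impl::HiFi::native::mean_dim_out(
    ctx, in, dim_list, /*keepdim=*/false, /*dtype=*/{}, out);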