-
Notifications
You must be signed in to change notification settings - Fork 902
Added HiFi optimized mean and where ops. #6483
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,170 @@ | ||
| /* | ||
| * Copyright (c) Meta Platforms, Inc. and affiliates. | ||
| * All rights reserved. | ||
| * | ||
| * This source code is licensed under the BSD-style license found in the | ||
| * LICENSE file in the root directory of this source tree. | ||
| */ | ||
|
|
||
| #include <cstdlib> | ||
|
|
||
| #include <executorch/kernels/portable/cpu/util/kernel_ops_util.h> | ||
| #include <executorch/kernels/portable/cpu/util/reduce_util.h> | ||
| #include <executorch/runtime/kernel/kernel_includes.h> | ||
| #include <executorch/runtime/platform/assert.h> | ||
|
|
||
| #include <executorch/backends/cadence/hifi/kernels/kernels.h> | ||
|
|
||
| using exec_aten::ScalarType; | ||
| using exec_aten::Tensor; | ||
| using executorch::aten::RuntimeContext; | ||
| using executorch::runtime::ArrayRef; | ||
| using torch::executor::Error; | ||
| using torch::executor::optional; | ||
|
|
||
| namespace cadence { | ||
| namespace impl { | ||
| namespace HiFi { | ||
| namespace native { | ||
|
|
||
| /** | ||
| * Copies the sizes of `in` and `out` into plain int arrays and writes the | ||
| * normalized reduction axes into `p_axis`. | ||
| * | ||
| * @param in Tensor whose first `num_inp_dims` sizes are copied to `inp_shape`. | ||
| * @param out Tensor whose first `num_out_dims` sizes are copied to `out_shape`. | ||
| * @param dim_list Axes to reduce. Negative entries are offset by | ||
| * `num_inp_dims`. When the optional is empty, every input axis is selected. | ||
| * @param inp_shape Caller-owned array with room for `num_inp_dims` ints. | ||
| * @param out_shape Caller-owned array with room for `num_out_dims` ints. | ||
| * @param p_axis Caller-owned array that receives one entry per selected axis; | ||
| * the caller must size it for the longest axis list it passes in. | ||
| * @param num_inp_dims Number of leading dimensions of `in` to copy. | ||
| * @param num_out_dims Number of leading dimensions of `out` to copy. | ||
| * @return Number of axes written to `p_axis`. | ||
| */ | ||
| int prepare_data( | ||
| const Tensor& in, | ||
| Tensor& out, | ||
| optional<ArrayRef<int64_t>> dim_list, | ||
| int* inp_shape, | ||
| int* out_shape, | ||
| int* p_axis, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: What is |
||
| int num_inp_dims, | ||
| int num_out_dims) { | ||
| for (int i = 0; i < num_inp_dims; i++) { | ||
| inp_shape[i] = static_cast<int>(in.size(i)); | ||
| } | ||
|
|
||
| for (int i = 0; i < num_out_dims; i++) { | ||
| out_shape[i] = static_cast<int>(out.size(i)); | ||
| } | ||
|
Comment on lines
+37
to
+43
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: can we make this a helper function that accepts a const Tensor& and mutable int* |
||
|
|
||
| int num_axis_dims = 0; | ||
| // An empty optional has no value() to iterate; treat it as a reduction over | ||
| // every input axis instead of dereferencing it. | ||
| if (!dim_list.has_value()) { | ||
| for (int i = 0; i < num_inp_dims; i++) { | ||
| p_axis[num_axis_dims++] = i; | ||
| } | ||
| return num_axis_dims; | ||
| } | ||
|
|
||
| for (const auto& d : dim_list.value()) { | ||
| // Wrap negative axes so that -1 refers to the last input dimension. | ||
| p_axis[num_axis_dims++] = static_cast<int>(d < 0 ? d + num_inp_dims : d); | ||
| } | ||
|
|
||
| return num_axis_dims; | ||
| } | ||
|
|
||
| /** | ||
| * Writes the mean of `in` over the axes in `dim_list` into `out`. | ||
| * | ||
| * The arguments are validated and `out` is resized first; if either step | ||
| * fails, the kernel check reports InvalidArgument through `ctx` and `out` is | ||
| * returned as-is. A float-to-float reduction over an explicit, non-empty axis | ||
| * list on an input of at most kNnlibMaxDim dimensions is handed to the NNLib | ||
| * kernel xa_nn_reduce_mean_4D_f32_f32. Every other case runs the portable | ||
| * map-reduce loop at the bottom of the function. | ||
| * | ||
| * @param ctx Kernel runtime context used for error reporting. | ||
| * @param in Tensor to reduce. | ||
| * @param dim_list Axes to reduce over; may be empty or absent. | ||
| * @param keepdim Forwarded to the argument check and to the output resize. | ||
| * @param dtype Forwarded to the argument check. | ||
| * @param out Destination tensor; resized by this function. | ||
| * @return Reference to `out`. | ||
| */ | ||
| Tensor& mean_dim_out( | ||
| RuntimeContext& ctx, | ||
| const Tensor& in, | ||
| optional<ArrayRef<int64_t>> dim_list, | ||
| bool keepdim, | ||
| optional<ScalarType> dtype, | ||
| Tensor& out) { | ||
| ET_KERNEL_CHECK( | ||
| ctx, | ||
| torch::executor::check_mean_dim_args(in, dim_list, keepdim, dtype, out), | ||
| InvalidArgument, | ||
| out); | ||
|
|
||
| ET_KERNEL_CHECK( | ||
| ctx, | ||
| torch::executor::resize_reduction_out(in, dim_list, keepdim, out) == | ||
| Error::Ok, | ||
| InvalidArgument, | ||
| out); | ||
|
|
||
| constexpr auto name = "mean.out"; | ||
| constexpr int kNnlibMaxDim = 4; | ||
|
|
||
| bool optimized = true; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: Please use |
||
|
|
||
| // The NNLib kernel reads and writes raw float buffers, so both tensors must | ||
| // be Float; checking only the output would reinterpret non-float input. | ||
| if (out.scalar_type() != ScalarType::Float) | ||
| optimized = false; | ||
|
|
||
| if (in.scalar_type() != ScalarType::Float) | ||
| optimized = false; | ||
|
|
||
| if (in.dim() > kNnlibMaxDim) | ||
| optimized = false; | ||
|
|
||
| // The fast path needs an explicit axis list: prepare_data() dereferences | ||
| // dim_list, and an empty list would hand NNLib zero axes. Both cases are | ||
| // left to the portable loop, which takes the optional as-is. | ||
| if (!dim_list.has_value()) { | ||
| optimized = false; | ||
| } else if (dim_list.value().size() == 0) { | ||
| optimized = false; | ||
| } | ||
|
|
||
| if (optimized) { | ||
| float* __restrict__ p_out = out.mutable_data_ptr<float>(); | ||
| const float* __restrict__ p_inp = in.const_data_ptr<float>(); | ||
|
|
||
| int num_inp_dims = in.dim(); | ||
| int num_out_dims = out.dim(); | ||
|
|
||
| int inp_shape[kNnlibMaxDim]; | ||
| int out_shape[kNnlibMaxDim]; | ||
| int p_axis[kNnlibMaxDim]; | ||
|
|
||
| // Pre-fill with 1 so entries beyond the tensors' real ranks are defined. | ||
| for (int i = 0; i < kNnlibMaxDim; i++) { | ||
| out_shape[i] = 1; | ||
| inp_shape[i] = 1; | ||
| p_axis[i] = 1; | ||
| } | ||
|
|
||
| int num_axis_dims = prepare_data( | ||
| in, | ||
| out, | ||
| dim_list, | ||
| inp_shape, | ||
| out_shape, | ||
| p_axis, | ||
| num_inp_dims, | ||
| num_out_dims); | ||
|
|
||
| // Reducing over every input axis: describe the output to NNLib as a | ||
| // single-element, one-dimensional tensor. | ||
| if (num_axis_dims == num_inp_dims) { | ||
| num_out_dims = 1; | ||
| out_shape[0] = 1; | ||
| } | ||
|
|
||
| // NOTE(review): the meaning of -3 is defined by NNLib; confirm it is the | ||
| // intended precision tag for float data. | ||
| int scratch_size = xa_nn_reduce_getsize_nhwc( | ||
| -3, inp_shape, num_inp_dims, p_axis, num_axis_dims, 1); | ||
|
|
||
| void* __restrict__ p_scratch_in = malloc(scratch_size); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's use temporary allocator instead of malloc please. |
||
|
|
||
| // NOTE(review): the kernel's return value is still ignored; its status | ||
| // convention needs to be confirmed before a check is added here. | ||
| xa_nn_reduce_mean_4D_f32_f32( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's add a check for the output of nnlib api. |
||
| p_out, | ||
| out_shape, | ||
| p_inp, | ||
| inp_shape, | ||
| p_axis, | ||
| num_out_dims, | ||
| num_inp_dims, | ||
| num_axis_dims, | ||
| p_scratch_in); | ||
|
|
||
| // The scratch buffer is only needed for the call above; release it so the | ||
| // fast path does not leak on every invocation. | ||
| free(p_scratch_in); | ||
|
|
||
| return out; | ||
| } | ||
|
Comment on lines
+90
to
+142
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we move this to a separate inline function? (like hifi_optimized_mean) |
||
|
|
||
| // Portable path: sum the reduced elements for each output index, then divide | ||
| // by the number of elements that were reduced. | ||
| ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] { | ||
| ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] { | ||
| CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>(); | ||
| const size_t num = torch::executor::get_reduced_dim_product(in, dim_list); | ||
|
|
||
| for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { | ||
| CTYPE_OUT sum = 0; | ||
| // An empty input has nothing to accumulate; the sum stays at zero. | ||
| if (in.numel() > 0) { | ||
| sum = torch::executor::map_reduce_over_dim_list<CTYPE_IN, CTYPE_OUT>( | ||
| [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); }, | ||
| [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, | ||
| in, | ||
| dim_list, | ||
| out_ix); | ||
| } | ||
| out_data[out_ix] = sum / static_cast<float>(num); | ||
| } | ||
| }); | ||
| }); | ||
|
|
||
| return out; | ||
| } | ||
|
|
||
| } // namespace native | ||
| } // namespace HiFi | ||
| } // namespace impl | ||
| } // namespace cadence | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
const Tensor& out