Added HiFi optimized mean and where ops. #6483
Conversation
Adding mean and where ops optimized on HiFi
🔗 Helpful Links — 🧪 See artifacts and rendered test results at hud.pytorch.org/pr/pytorch/executorch/6483
Note: Links to docs will display an error until the docs builds have been completed. ❗ 1 Active SEV: There is 1 currently active SEV. If your PR is affected, please view it below. This comment was automatically generated by Dr. CI and updates every 15 minutes. |
|
|
||
| int prepare_data( | ||
| const Tensor& in, | ||
| Tensor& out, |
hsharma35
left a comment
There was a problem hiding this comment.
LGTM overall. Added some minor comments for readability.
| constexpr auto name = "mean.out"; | ||
| constexpr int kNnlibMaxDim = 4; | ||
|
|
||
| bool optimized = 1; |
There was a problem hiding this comment.
Nit: Please use true or false instead of 0 or 1
| for (int i = 0; i < num_inp_dims; i++) { | ||
| inp_shape[i] = in.size(i); | ||
| } | ||
|
|
||
| for (int i = 0; i < num_out_dims; i++) { | ||
| out_shape[i] = out.size(i); | ||
| } |
There was a problem hiding this comment.
Nit: Can we make this a helper function that accepts a `const Tensor&` and a mutable `int*`?
| optional<ArrayRef<int64_t>> dim_list, | ||
| int* inp_shape, | ||
| int* out_shape, | ||
| int* p_axis, |
There was a problem hiding this comment.
Nit: What is p_axis used for? Can we please rename this to make the usage obvious (or add comment)?
| int scratch_size = xa_nn_reduce_getsize_nhwc( | ||
| -3, inp_shape, num_inp_dims, p_axis, num_axis_dims, 1); | ||
|
|
||
| void* __restrict__ p_scratch_in = (void* __restrict__)malloc(scratch_size); |
There was a problem hiding this comment.
Let's use temporary allocator instead of malloc please.
|
|
||
| int a_dim = a.dim(), b_dim = b.dim(), con_dim = cond.dim(), | ||
| out_dim = out.dim(); | ||
| bool optimized = 1; |
There was a problem hiding this comment.
Nit: use true or false instead of integers.
| for (int i = 0; i < 4; i++) { | ||
| con_shape[i] = out_shape[i]; | ||
| } | ||
| xa_nn_elm_where_broadcast_4D_f32xf32_f32( |
There was a problem hiding this comment.
ret = nnlib(...)
ET_KERNEL_CHECK(
ctx,
ret == 0,
ERROR_CODE,
out);
| con_shape); | ||
| free(p_scratch); | ||
| } else { | ||
| xa_nn_elm_where_broadcast_4D_f32xf32_f32( |
There was a problem hiding this comment.
ret = nnlib(...)
ET_KERNEL_CHECK(
ctx,
ret == 0,
ERROR_CODE,
out);
| con_shape); | ||
| } | ||
| } else { | ||
| xa_nn_elm_where_f32xf32_f32(out_data, a_data, b_data, con, out.numel()); |
There was a problem hiding this comment.
ret = nnlib(...)
ET_KERNEL_CHECK(
ctx,
ret == 0,
ERROR_CODE,
out);
| "Unhandled dtype %s for where.self_out", | ||
| torch::executor::toString(cond_type)); | ||
|
|
||
| int a_dim = a.dim(), b_dim = b.dim(), con_dim = cond.dim(), |
There was a problem hiding this comment.
Nit: rename con_dim -> cond_dim
| if (optimized) { | ||
| const float* a_data = a.const_data_ptr<float>(); | ||
| const float* b_data = b.const_data_ptr<float>(); | ||
| float* out_data = out.mutable_data_ptr<float>(); | ||
| const unsigned char* con = cond.const_data_ptr<uint8_t>(); | ||
|
|
||
| if (broadcast == 1) { | ||
| int out_shape[kNnlibMaxDim]; | ||
| int inp1_shape[kNnlibMaxDim]; | ||
| int inp2_shape[kNnlibMaxDim]; | ||
| int con_shape[kNnlibMaxDim]; | ||
|
|
||
| for (int i = 0; i < kNnlibMaxDim; i++) { | ||
| con_shape[i] = 1; | ||
| out_shape[i] = 1; | ||
| inp1_shape[i] = 1; | ||
| inp2_shape[i] = 1; | ||
| } | ||
|
|
||
| int off_o = kNnlibMaxDim - out.dim(); | ||
| int off_a = kNnlibMaxDim - a.dim(); | ||
| int off_b = kNnlibMaxDim - b.dim(); | ||
| int off_c = kNnlibMaxDim - cond.dim(); | ||
|
|
||
| for (int i = 0; i < out.dim(); i++) | ||
| out_shape[i + off_o] = out.size(i); | ||
| for (int i = 0; i < a.dim(); i++) | ||
| inp1_shape[i + off_a] = a.size(i); | ||
| for (int i = 0; i < b.dim(); i++) | ||
| inp2_shape[i + off_b] = b.size(i); | ||
| for (int i = 0; i < cond.dim(); i++) | ||
| con_shape[i + off_c] = cond.size(i); | ||
|
|
||
| if (con_shape[0] != out_shape[0] || con_shape[1] != out_shape[1] || | ||
| con_shape[2] != out_shape[2] || con_shape[3] != out_shape[3]) { | ||
| void* p_scratch = | ||
| malloc(out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]); | ||
| const unsigned char* p_brd_cond = (const unsigned char*)p_scratch; | ||
| xa_nn_broadcast_8_8( | ||
| (WORD8* __restrict__)p_brd_cond, | ||
| out_shape, | ||
| (const WORD8* __restrict__)con, | ||
| con_shape, | ||
| 4); | ||
|
|
||
| for (int i = 0; i < 4; i++) { | ||
| con_shape[i] = out_shape[i]; | ||
| } | ||
| xa_nn_elm_where_broadcast_4D_f32xf32_f32( | ||
| out_data, | ||
| out_shape, | ||
| a_data, | ||
| inp1_shape, | ||
| b_data, | ||
| inp2_shape, | ||
| p_brd_cond, | ||
| con_shape); | ||
| free(p_scratch); | ||
| } else { | ||
| xa_nn_elm_where_broadcast_4D_f32xf32_f32( | ||
| out_data, | ||
| out_shape, | ||
| a_data, | ||
| inp1_shape, | ||
| b_data, | ||
| inp2_shape, | ||
| con, | ||
| con_shape); | ||
| } | ||
| } else { | ||
| xa_nn_elm_where_f32xf32_f32(out_data, a_data, b_data, con, out.numel()); | ||
| } | ||
| return out; | ||
| } |
There was a problem hiding this comment.
Can we move this to a separate inline function?
No description provided.