From f1ace777b56cba104026b59dad0d1616e8caab0f Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Fri, 28 Feb 2025 17:16:10 -0800
Subject: [PATCH 1/3] Update

[ghstack-poisoned]
---
 kernels/portable/cpu/op_argmax.cpp |  5 ++++-
 kernels/portable/cpu/op_argmin.cpp | 12 +++++++++++-
 kernels/test/op_argmax_test.cpp    | 13 +++++++++++++
 kernels/test/op_argmin_test.cpp    | 13 +++++++++++++
 4 files changed, 41 insertions(+), 2 deletions(-)
diff --git a/kernels/portable/cpu/op_argmax.cpp b/kernels/portable/cpu/op_argmax.cpp
index 5eb656d5b76..7e95c305e89 100644
--- a/kernels/portable/cpu/op_argmax.cpp
+++ b/kernels/portable/cpu/op_argmax.cpp
@@ -49,7 +49,10 @@ Tensor& argmax_out(
     for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) {
       std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
           [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
-            if (!std::isnan(acc_val) && (std::isnan(v) || v > acc_val)) {
+            // the below condition as written is equivalent to
+            // !isnan(accval) && (isnan(v) || v > acc_val). See
+            // argument in op_argmin.cpp.
+            if (!std::isnan(acc_val) && !(v <= acc_val)) {
               acc_val = v;
               acc_ix = ix;
             }
diff --git a/kernels/portable/cpu/op_argmin.cpp b/kernels/portable/cpu/op_argmin.cpp
index 1c4a2572ea8..8a94994c4be 100644
--- a/kernels/portable/cpu/op_argmin.cpp
+++ b/kernels/portable/cpu/op_argmin.cpp
@@ -49,7 +49,17 @@ Tensor& argmin_out(
     for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) {
       std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
           [](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
-            if (!std::isnan(acc_val) && (std::isnan(v) || v < acc_val)) {
+            // the below condition as written is equivalent to !isnan(accval) &&
+            // (isnan(v) || v < acc_val). cases:
+            // - if neither acc_val nor v is NaN, !(v >= acc_val) is
+            //   trivially equivalent to v < acc_val.
+            // - if acc_val is NaN, the whole thing is trivially false.
+            // - if acc_val is not NaN and v is NaN, then v >= acc_val
+            // - is false because all comparisons involving NaN are
+            // - false, so the result is true. The result is trivially
+            // - true for the above condition that uses isnan(v) as
+            // - well.
+            if (!std::isnan(acc_val) && !(v >= acc_val)) {
               acc_val = v;
               acc_ix = ix;
             }
diff --git a/kernels/test/op_argmax_test.cpp b/kernels/test/op_argmax_test.cpp
index 66c79cefff7..4d68dfe88be 100644
--- a/kernels/test/op_argmax_test.cpp
+++ b/kernels/test/op_argmax_test.cpp
@@ -90,3 +90,16 @@ TEST_F(OpArgmaxTest, SanityCheckNullDim) {
   EXPECT_TENSOR_EQ(out, expected);
   // clang-format on
 }
+
+TEST_F(OpArgmaxTest, FirstNaNWins) {
+  TensorFactory<ScalarType::Float> tf_float;
+  Tensor in = tf_float.make({4}, {1, NAN, -4, NAN});
+
+  TensorFactory<ScalarType::Long> tf_long;
+  Tensor out = tf_long.zeros({});
+  Tensor expected = tf_long.make({}, {1});
+
+  Tensor ret = op_argmax_out(in, {}, false, out);
+  EXPECT_TENSOR_EQ(out, ret);
+  EXPECT_TENSOR_EQ(out, expected);
+}
diff --git a/kernels/test/op_argmin_test.cpp b/kernels/test/op_argmin_test.cpp
index 250fe4f7e1e..a0b2699a28f 100644
--- a/kernels/test/op_argmin_test.cpp
+++ b/kernels/test/op_argmin_test.cpp
@@ -90,3 +90,16 @@ TEST_F(OpArgminTest, SanityCheckNullDim) {
   EXPECT_TENSOR_EQ(out, expected);
   // clang-format on
 }
+
+TEST_F(OpArgminTest, FirstNaNWins) {
+  TensorFactory<ScalarType::Float> tf_float;
+  Tensor in = tf_float.make({4}, {1, NAN, -4, NAN});
+
+  TensorFactory<ScalarType::Long> tf_long;
+  Tensor out = tf_long.zeros({});
+  Tensor expected = tf_long.make({}, {1});
+
+  Tensor ret = op_argmin_out(in, {}, false, out);
+  EXPECT_TENSOR_EQ(out, ret);
+  EXPECT_TENSOR_EQ(out, expected);
+}

From d3a0f675767cf527ac680f32bbaa0b0fe97e8804 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Fri, 28 Feb 2025 17:16:14 -0800
Subject: [PATCH 2/3] Update

[ghstack-poisoned]
---
 .../cpu/util/delinearized_indexes_range.h     | 108 ++++++++++++++++++
 kernels/portable/cpu/util/targets.bzl         |  13 +++
 kernels/portable/cpu/util/test/CMakeLists.txt |   4 +-
 .../test/delinearized_indexes_range_test.cpp  |  69 +++++++++++
 kernels/portable/cpu/util/test/targets.bzl    |  10 ++
 test/utils/OSSTestConfig.json                 |   4 +-
 6 files changed, 206 insertions(+), 2 deletions(-)
 create mode 100644 kernels/portable/cpu/util/delinearized_indexes_range.h
 create mode 100644 kernels/portable/cpu/util/test/delinearized_indexes_range_test.cpp

diff --git a/kernels/portable/cpu/util/delinearized_indexes_range.h b/kernels/portable/cpu/util/delinearized_indexes_range.h
new file mode 100644
index 00000000000..c6a9cc91bfb
--- /dev/null
+++ b/kernels/portable/cpu/util/delinearized_indexes_range.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <iterator>
+
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/util/tensor_dimension_limit.h>
+
+namespace torch::executor {
+
+namespace internal {
+class DelinearizedIndexesIterator {
+ public:
+  using difference_type = ssize_t;
+  using value_type = std::array<std::size_t, ::executorch::runtime::kTensorDimensionLimit>;
+  using reference = const value_type&;
+  using pointer = const value_type*;
+  using iterator_category = std::forward_iterator_tag;
+
+  DelinearizedIndexesIterator() = default;
+
+  explicit DelinearizedIndexesIterator(const Tensor& t)
+      : idx_(0), dim_(t.dim()), shape_(t.sizes()) {
+  }
+
+  struct make_end_t {
+    explicit constexpr make_end_t() = default;
+  };
+
+  DelinearizedIndexesIterator(make_end_t, const Tensor& t)
+      : idx_(t.numel()) {}
+
+  bool operator==(const DelinearizedIndexesIterator& rhs) const {
+    return idx_ == rhs.idx_;
+  }
+
+  bool operator!=(const DelinearizedIndexesIterator& rhs) const {
+    return !operator==(rhs);
+  }
+
+  reference operator*() const {
+    return repr_;
+  }
+
+  pointer operator->() const {
+    return &repr_;
+  }
+
+  DelinearizedIndexesIterator& operator++() {
+    idx_++;
+    for (auto ii = dim_ - 1; ii >= 0; --ii) {
+      repr_[ii]++;
+      ET_DCHECK(repr_[ii] <= shape_[ii]);
+      if ET_LIKELY (repr_[ii] < shape_[ii]) {
+        break;
+      } else {
+        repr_[ii] = 0;
+      }
+    }
+    return *this;
+  }
+
+  DelinearizedIndexesIterator operator++(int) {
+    auto it = *this;
+    operator++();
+    return it;
+  }
+
+  difference_type operator-(const DelinearizedIndexesIterator& rhs) const {
+    return difference_type(idx_ - rhs.idx_);
+  }
+
+ private:
+  std::size_t idx_ = 0;
+  value_type repr_ = {0,};
+  ssize_t dim_;
+  ArrayRef<exec_aten::SizesType> shape_;
+};
+} // namespace internal
+
+class DelinearizedIndexesRange {
+ public:
+  using iterator = internal::DelinearizedIndexesIterator;
+
+  DelinearizedIndexesRange(const Tensor& t) :
+      tensor_(t) {}
+
+  iterator begin() const {
+    return iterator(tensor_);
+  }
+
+  iterator end() {
+    return iterator(iterator::make_end_t(), tensor_);
+  }
+ private:
+  const Tensor& tensor_;
+};
+} // namespace torch::executor
diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl
index eef765d5eec..95970b600ef 100644
--- a/kernels/portable/cpu/util/targets.bzl
+++ b/kernels/portable/cpu/util/targets.bzl
@@ -280,6 +280,19 @@ def define_common_targets():
         visibility = ["//executorch/kernels/portable/cpu/..."],
     )
 
+    runtime.cxx_library(
+        name = "delinearized_indexes_range",
+        exported_headers = ["delinearized_indexes_range.h"],
+        deps = [
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/exec_aten/util:tensor_dimension_limit",
+        ],
+        visibility = [
+            "//executorch/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
     # Utility functions that can be used by operators that perform reduction
     for aten_mode in get_aten_mode_options():
         suffix = "_aten" if aten_mode else ""
diff --git a/kernels/portable/cpu/util/test/CMakeLists.txt b/kernels/portable/cpu/util/test/CMakeLists.txt
index 5f81e4b6aec..76c53ea8448 100644
--- a/kernels/portable/cpu/util/test/CMakeLists.txt
+++ b/kernels/portable/cpu/util/test/CMakeLists.txt
@@ -19,7 +19,9 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)
 
 include(${EXECUTORCH_ROOT}/build/Test.cmake)
 
-set(_test_srcs broadcast_test.cpp reduce_test.cpp)
+set(_test_srcs broadcast_test.cpp delinearized_indexes_range_test.cpp
+               reduce_test.cpp
+)
 
 et_cxx_test(
   kernels_portable_cpu_util_test SOURCES ${_test_srcs} EXTRA_LIBS
diff --git a/kernels/portable/cpu/util/test/delinearized_indexes_range_test.cpp b/kernels/portable/cpu/util/test/delinearized_indexes_range_test.cpp
new file mode 100644
index 00000000000..395e1a74d98
--- /dev/null
+++ b/kernels/portable/cpu/util/test/delinearized_indexes_range_test.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/kernels/portable/cpu/util/delinearized_indexes_range.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+
+#include <gtest/gtest.h>
+
+using executorch::aten::ScalarType;
+using executorch::aten::Tensor;
+using executorch::runtime::testing::TensorFactory;
+using torch::executor::DelinearizedIndexesRange;
+
+TEST(DelinearizedIndexesRangeTest, Empty) {
+  TensorFactory<ScalarType::Int> tf;
+
+  Tensor a = tf.make({0}, {});
+  ASSERT_EQ(a.numel(), 0);
+  bool loop_entered = false;
+  for (auto _ : DelinearizedIndexesRange(a)) {
+    loop_entered = true;
+  }
+  EXPECT_FALSE(loop_entered);
+}
+
+TEST(DelinearizedIndexesRangeTest, OneD) {
+  TensorFactory<ScalarType::Int> tf;
+
+  Tensor a = tf.zeros({5});
+  DelinearizedIndexesRange r(a);
+  std::vector<typename DelinearizedIndexesRange::iterator::value_type> v(r.begin(), r.end());
+  int idx = 0;
+  for (const auto& elem: v) {
+    EXPECT_EQ(elem[0], idx++);
+  }
+}
+
+TEST(DelinearizedIndexesRangeTest, ThreeD) {
+  TensorFactory<ScalarType::Int> tf;
+  Tensor a = tf.zeros({3, 2, 3});
+  DelinearizedIndexesRange r(a);
+  std::vector<typename DelinearizedIndexesRange::iterator::value_type> v(r.begin(), r.end());
+  std::vector<typename DelinearizedIndexesRange::iterator::value_type> expected = {
+    {0, 0, 0},
+    {0, 0, 1},
+    {0, 0, 2},
+    {0, 1, 0},
+    {0, 1, 1},
+    {0, 1, 2},
+    {1, 0, 0},
+    {1, 0, 1},
+    {1, 0, 2},
+    {1, 1, 0},
+    {1, 1, 1},
+    {1, 1, 2},
+    {2, 0, 0},
+    {2, 0, 1},
+    {2, 0, 2},
+    {2, 1, 0},
+    {2, 1, 1},
+    {2, 1, 2},
+  };
+  EXPECT_EQ(v, expected);
+}
diff --git a/kernels/portable/cpu/util/test/targets.bzl b/kernels/portable/cpu/util/test/targets.bzl
index 28988b90dcc..25e16237361 100644
--- a/kernels/portable/cpu/util/test/targets.bzl
+++ b/kernels/portable/cpu/util/test/targets.bzl
@@ -12,6 +12,16 @@ def define_common_targets():
         ],
     )
 
+    runtime.cxx_test(
+        name = "delinearized_indexes_range_test",
+        srcs = ["delinearized_indexes_range_test.cpp"],
+        deps = [
+            "//executorch/kernels/portable/cpu/util:delinearized_indexes_range",
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
+        ],
+    )
+
     runtime.cxx_test(
         name = "reduce_test",
         srcs = ["reduce_test.cpp"],
diff --git a/test/utils/OSSTestConfig.json b/test/utils/OSSTestConfig.json
index 70cb2d2e44f..b94f11ea94e 100644
--- a/test/utils/OSSTestConfig.json
+++ b/test/utils/OSSTestConfig.json
@@ -7,7 +7,8 @@
             "op_fast_hadamard_transform_test.cpp"
         ],
         "additional_libs": [
-            "custom_ops"
+            "custom_ops",
+            "dumb_fht"
         ]
     },
     {
@@ -62,6 +63,7 @@
         "directory": "kernels/portable/cpu/util/test",
         "sources": [
             "broadcast_test.cpp",
+            "delinearized_indexes_range_test.cpp",
             "reduce_test.cpp"
         ],
         "additional_libs": [

From 14a55be16b0f20ecb614aa51b1333d42f56f8e80 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Tue, 4 Mar 2025 10:27:18 -0800
Subject: [PATCH 3/3] Update

[ghstack-poisoned]
---
 kernels/portable/cpu/util/broadcast_indexes_range.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernels/portable/cpu/util/broadcast_indexes_range.h b/kernels/portable/cpu/util/broadcast_indexes_range.h
index bebf5a056e6..7ee4f3fb2b1 100644
--- a/kernels/portable/cpu/util/broadcast_indexes_range.h
+++ b/kernels/portable/cpu/util/broadcast_indexes_range.h
@@ -158,7 +158,8 @@ class BroadcastIndexesIterator {
   // output_dim. This is straightforwardly implementable with an
   // adjusted stride array that contains 0s where the padded input
   // shape would contain 1s.
-  std::array<ShapeType, kNumInputs> effective_input_broadcast_strides_ = {{0}};
+  std::array<ShapeType, kNumInputs> effective_input_broadcast_strides_ = {
+      {{0}}};
 };
 } // namespace internal