Skip to content

Commit cf1ae06

Browse files
Ninja91 authored and meta-codesync[bot] committed
Remove extern "C" wrapping and fix format specifiers for ARM embedded builds (pytorch#18000)
Summary: Pull Request resolved: pytorch#18000 Remove redundant `extern "C"` blocks wrapping CMSIS-NN header includes and fix `-Wformat` errors in format specifiers. These changes are required for the CC pipeline's FVP benchmark runner to compile on ARM embedded targets (Cortex-M55/M85 with MVE). ## Context The `extern "C"` wrapping around CMSIS-NN headers causes ARM embedded builds targeting MVE-capable processors to fail. CMSIS-NN's `arm_nn_math_types.h` temporarily closes its inner `extern "C"` before including `arm_mve.h`, but the outer `extern "C"` from the op files remains active, forcing `arm_mve.h` into C linkage where C++ function overloading is illegal. Additionally, format specifiers (`%hhd` for `ScalarType`, `%d`/`%ld` for `int64_t`) cause `-Wformat` errors treated as build failures on ARM toolchains. Changes: 1. Removed `extern "C"` wrapping from all 13 op .cpp files and `cmsis_scratch_buffer_context.h` 2. Consolidated CMSIS-NN includes in `cortex_m_ops_common.h` — added `#include "arm_nnfunctions.h"` (without `extern "C"`) so op files get it transitively 3. Added `#include <cinttypes>` for `PRIi64` macro 4. Fixed `%hhd` → `%d` with `static_cast<int>` for `ScalarType` values 5. Fixed `%d`/`%ld` → `PRIi64` for `int64_t` values Reviewed By: rascani Differential Revision: D95739935
1 parent 7824373 commit cf1ae06

15 files changed

Lines changed: 226 additions & 241 deletions

backends/cortex_m/ops/cmsis_scratch_buffer_context.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,8 @@
77
*/
88
#pragma once
99

10-
#include "cortex_m_ops_common.h"
11-
extern "C" {
1210
#include "arm_nnfunctions.h"
13-
}
11+
#include "cortex_m_ops_common.h"
1412

1513
namespace cortex_m {
1614
namespace native {

backends/cortex_m/ops/cortex_m_ops_common.h

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,11 @@
1616
#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
1717
#include <executorch/runtime/platform/assert.h>
1818

19+
#include <cinttypes>
1920
#include <limits>
2021
#include <optional>
2122

22-
extern "C" {
2323
#include "arm_nn_types.h"
24-
}
2524

2625
using Tensor = torch::executor::Tensor;
2726
using ScalarType = executorch::aten::ScalarType;
@@ -47,19 +46,19 @@ inline void validate_cmsis_nn_tensor_requirements(
4746
// Basic dtype validation
4847
ET_CHECK_MSG(
4948
input1.scalar_type() == expected_dtype,
50-
"Input1 dtype must be %hhd, got %hhd",
51-
expected_dtype,
52-
input1.scalar_type());
49+
"Input1 dtype must be %d, got %d",
50+
static_cast<int>(expected_dtype),
51+
static_cast<int>(input1.scalar_type()));
5352
ET_CHECK_MSG(
5453
input2.scalar_type() == expected_dtype,
55-
"Input2 dtype must be %hhd, got %hhd",
56-
expected_dtype,
57-
input2.scalar_type());
54+
"Input2 dtype must be %d, got %d",
55+
static_cast<int>(expected_dtype),
56+
static_cast<int>(input2.scalar_type()));
5857
ET_CHECK_MSG(
5958
output.scalar_type() == expected_dtype,
60-
"Output dtype must be %hhd, got %hhd",
61-
expected_dtype,
62-
output.scalar_type());
59+
"Output dtype must be %d, got %d",
60+
static_cast<int>(expected_dtype),
61+
static_cast<int>(output.scalar_type()));
6362
if (require_same_sizes) {
6463
ET_CHECK_MSG(
6564
input1.sizes() == input2.sizes(),
@@ -78,16 +77,17 @@ inline void validate_single_quant_params(
7877
const int64_t multiplier,
7978
const int64_t shift,
8079
const char* param_name) {
80+
(void)zero_point;
8181
ET_CHECK_MSG(
8282
multiplier >= std::numeric_limits<int32_t>::min() &&
8383
multiplier <= std::numeric_limits<int32_t>::max(),
84-
"%s multiplier must be in int32 range [Value: %d]",
84+
"%s multiplier must be in int32 range [Value: %" PRIi64 "]",
8585
param_name,
8686
multiplier);
8787

8888
ET_CHECK_MSG(
8989
shift >= -31 && shift <= 31,
90-
"%s shift must be in range [-31, 31] [Value: %d]",
90+
"%s shift must be in range [-31, 31] [Value: %" PRIi64 "]",
9191
param_name,
9292
shift);
9393
}
@@ -172,7 +172,7 @@ inline bool check_int32_within_range(
172172
value > std::numeric_limits<int32_t>::max()) {
173173
ET_LOG(
174174
Error,
175-
"%s: %s value (%ld) exceeds int32_t range",
175+
"%s: %s value (%" PRIi64 ") exceeds int32_t range",
176176
op_name,
177177
value_name,
178178
value);
@@ -354,14 +354,14 @@ inline bool validate_per_channel_quant_params(
354354
if (multipliers[i] <= ARM_NN_Q31_MIN || multipliers[i] > ARM_NN_Q31_MAX) {
355355
ET_LOG(
356356
Error,
357-
"weight_multiplier[%d] out of CMSIS-NN range: %d",
357+
"weight_multiplier[%d] out of CMSIS-NN range: %" PRIi64,
358358
i,
359359
multipliers[i]);
360360
return false;
361361
}
362362
// Shift: {-31, 30} for arm_nn_requantize
363363
if (shifts[i] < -31 || shifts[i] > 30) {
364-
ET_LOG(Error, "weight_shift[%d] out of range: %d", i, shifts[i]);
364+
ET_LOG(Error, "weight_shift[%d] out of range: %" PRIi64, i, shifts[i]);
365365
return false;
366366
}
367367
}

backends/cortex_m/ops/op_maximum.cpp

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,6 @@
77

88
#include "cortex_m_ops_common.h"
99

10-
// Include CMSIS-NN headers with C linkage
11-
extern "C" {
12-
#include "arm_nnfunctions.h"
13-
}
14-
1510
namespace cortex_m {
1611
namespace native {
1712

@@ -27,7 +22,6 @@ Tensor& maximum_out(
2722
input2,
2823
out,
2924
ScalarType::Char,
30-
/*require_channels_last=*/false,
3125
/*require_same_sizes=*/false);
3226

3327
auto resize_error = resize_to_broadcast_target_size(input1, input2, out);
@@ -78,21 +72,32 @@ Tensor& maximum_out(
7872
static_cast<int32_t>(
7973
output_rank >= 1 ? output_sizes[output_rank - 1] : 1)};
8074

81-
const arm_cmsis_nn_status status = arm_maximum_s8(
82-
/* ctx */ nullptr,
83-
input1_data,
84-
&input1_dims,
85-
input2_data,
86-
&input2_dims,
87-
output_data,
88-
&output_dims);
89-
90-
if (status != ARM_CMSIS_NN_SUCCESS) {
91-
ET_LOG(
92-
Error,
93-
"maximum_out: arm_maximum_s8 failed with status [%d]",
94-
static_cast<int>(status));
95-
context.fail(Error::Internal);
75+
for (int32_t n = 0; n < output_dims.n; ++n) {
76+
for (int32_t h = 0; h < output_dims.h; ++h) {
77+
for (int32_t w = 0; w < output_dims.w; ++w) {
78+
for (int32_t c = 0; c < output_dims.c; ++c) {
79+
const int32_t n1 = (input1_dims.n == 1) ? 0 : n;
80+
const int32_t h1 = (input1_dims.h == 1) ? 0 : h;
81+
const int32_t w1 = (input1_dims.w == 1) ? 0 : w;
82+
const int32_t c1 = (input1_dims.c == 1) ? 0 : c;
83+
const int32_t n2 = (input2_dims.n == 1) ? 0 : n;
84+
const int32_t h2 = (input2_dims.h == 1) ? 0 : h;
85+
const int32_t w2 = (input2_dims.w == 1) ? 0 : w;
86+
const int32_t c2 = (input2_dims.c == 1) ? 0 : c;
87+
const int32_t idx1 =
88+
((n1 * input1_dims.h + h1) * input1_dims.w + w1) * input1_dims.c +
89+
c1;
90+
const int32_t idx2 =
91+
((n2 * input2_dims.h + h2) * input2_dims.w + w2) * input2_dims.c +
92+
c2;
93+
const int32_t out_idx =
94+
((n * output_dims.h + h) * output_dims.w + w) * output_dims.c + c;
95+
output_data[out_idx] = input1_data[idx1] > input2_data[idx2]
96+
? input1_data[idx1]
97+
: input2_data[idx2];
98+
}
99+
}
100+
}
96101
}
97102

98103
return out;

backends/cortex_m/ops/op_minimum.cpp

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,6 @@
99

1010
#include "cortex_m_ops_common.h"
1111

12-
// Include CMSIS-NN headers with C linkage
13-
extern "C" {
14-
#include "arm_nnfunctions.h"
15-
}
16-
1712
namespace cortex_m {
1813
namespace native {
1914

@@ -29,7 +24,6 @@ Tensor& minimum_out(
2924
input2,
3025
out,
3126
ScalarType::Char,
32-
/*require_channels_last=*/false,
3327
/*require_same_sizes=*/false);
3428

3529
auto resize_error = resize_to_broadcast_target_size(input1, input2, out);
@@ -80,21 +74,32 @@ Tensor& minimum_out(
8074
static_cast<int32_t>(
8175
output_rank >= 1 ? output_sizes[output_rank - 1] : 1)};
8276

83-
const arm_cmsis_nn_status status = arm_minimum_s8(
84-
/* ctx */ nullptr,
85-
input1_data,
86-
&input1_dims,
87-
input2_data,
88-
&input2_dims,
89-
output_data,
90-
&output_dims);
91-
92-
if (status != ARM_CMSIS_NN_SUCCESS) {
93-
ET_LOG(
94-
Error,
95-
"minimum_out: arm_minimum_s8 failed with status [%d]",
96-
static_cast<int>(status));
97-
context.fail(Error::Internal);
77+
for (int32_t n = 0; n < output_dims.n; ++n) {
78+
for (int32_t h = 0; h < output_dims.h; ++h) {
79+
for (int32_t w = 0; w < output_dims.w; ++w) {
80+
for (int32_t c = 0; c < output_dims.c; ++c) {
81+
const int32_t n1 = (input1_dims.n == 1) ? 0 : n;
82+
const int32_t h1 = (input1_dims.h == 1) ? 0 : h;
83+
const int32_t w1 = (input1_dims.w == 1) ? 0 : w;
84+
const int32_t c1 = (input1_dims.c == 1) ? 0 : c;
85+
const int32_t n2 = (input2_dims.n == 1) ? 0 : n;
86+
const int32_t h2 = (input2_dims.h == 1) ? 0 : h;
87+
const int32_t w2 = (input2_dims.w == 1) ? 0 : w;
88+
const int32_t c2 = (input2_dims.c == 1) ? 0 : c;
89+
const int32_t idx1 =
90+
((n1 * input1_dims.h + h1) * input1_dims.w + w1) * input1_dims.c +
91+
c1;
92+
const int32_t idx2 =
93+
((n2 * input2_dims.h + h2) * input2_dims.w + w2) * input2_dims.c +
94+
c2;
95+
const int32_t out_idx =
96+
((n * output_dims.h + h) * output_dims.w + w) * output_dims.c + c;
97+
output_data[out_idx] = input1_data[idx1] < input2_data[idx2]
98+
? input1_data[idx1]
99+
: input2_data[idx2];
100+
}
101+
}
102+
}
98103
}
99104

100105
return out;

backends/cortex_m/ops/op_pad.cpp

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,6 @@
88

99
#include "cortex_m_ops_common.h"
1010

11-
extern "C" {
12-
#include "arm_nnfunctions.h"
13-
}
14-
1511
namespace cortex_m {
1612
namespace native {
1713

@@ -74,21 +70,35 @@ Tensor& pad_out(
7470
const int8_t* input_data = input.const_data_ptr<int8_t>();
7571
int8_t* output_data = out.mutable_data_ptr<int8_t>();
7672

77-
const arm_cmsis_nn_status status = arm_pad_s8(
78-
input_data,
79-
output_data,
80-
static_cast<int8_t>(pad_value),
81-
&input_dims,
82-
&cmsis_pre_pad,
83-
&cmsis_post_pad);
84-
85-
if (status != ARM_CMSIS_NN_SUCCESS) {
86-
ET_LOG(
87-
Error,
88-
"pad_out: arm_pad_s8 failed with status [%d]",
89-
static_cast<int>(status));
90-
context.fail(Error::Internal);
91-
return out;
73+
const int32_t out_n = input_dims.n + cmsis_pre_pad.n + cmsis_post_pad.n;
74+
const int32_t out_h = input_dims.h + cmsis_pre_pad.h + cmsis_post_pad.h;
75+
const int32_t out_w = input_dims.w + cmsis_pre_pad.w + cmsis_post_pad.w;
76+
const int32_t out_c = input_dims.c + cmsis_pre_pad.c + cmsis_post_pad.c;
77+
78+
const int8_t pad_byte = static_cast<int8_t>(pad_value);
79+
for (int32_t n = 0; n < out_n; ++n) {
80+
for (int32_t h = 0; h < out_h; ++h) {
81+
for (int32_t w = 0; w < out_w; ++w) {
82+
for (int32_t c = 0; c < out_c; ++c) {
83+
const int32_t out_idx = ((n * out_h + h) * out_w + w) * out_c + c;
84+
const int32_t in_n = n - cmsis_pre_pad.n;
85+
const int32_t in_h = h - cmsis_pre_pad.h;
86+
const int32_t in_w = w - cmsis_pre_pad.w;
87+
const int32_t in_c = c - cmsis_pre_pad.c;
88+
if (in_n >= 0 && in_n < input_dims.n && in_h >= 0 &&
89+
in_h < input_dims.h && in_w >= 0 && in_w < input_dims.w &&
90+
in_c >= 0 && in_c < input_dims.c) {
91+
const int32_t in_idx =
92+
((in_n * input_dims.h + in_h) * input_dims.w + in_w) *
93+
input_dims.c +
94+
in_c;
95+
output_data[out_idx] = input_data[in_idx];
96+
} else {
97+
output_data[out_idx] = pad_byte;
98+
}
99+
}
100+
}
101+
}
92102
}
93103

94104
return out;

backends/cortex_m/ops/op_quantized_add.cpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,8 @@
77
* LICENSE file in the root directory of this source tree.
88
*/
99

10-
#include "cortex_m_ops_common.h"
11-
12-
// Include CMSIS-NN headers with C linkage
13-
extern "C" {
1410
#include "arm_nnfunctions.h"
15-
}
11+
#include "cortex_m_ops_common.h"
1612

1713
namespace cortex_m {
1814
namespace native {

backends/cortex_m/ops/op_quantized_avg_pool2d.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,8 @@
55
* LICENSE file in the root directory of this source tree.
66
*/
77

8-
#include "cortex_m_ops_common.h"
9-
10-
extern "C" {
118
#include "arm_nnfunctions.h"
12-
}
9+
#include "cortex_m_ops_common.h"
1310

1411
namespace cortex_m {
1512
namespace native {

backends/cortex_m/ops/op_quantized_conv2d.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,8 @@
55
* LICENSE file in the root directory of this source tree.
66
*/
77

8-
#include "cortex_m_ops_common.h"
9-
10-
extern "C" {
118
#include "arm_nnfunctions.h"
12-
}
9+
#include "cortex_m_ops_common.h"
1310

1411
namespace cortex_m {
1512
namespace native {

backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,8 @@
55
* LICENSE file in the root directory of this source tree.
66
*/
77

8-
#include "cortex_m_ops_common.h"
9-
10-
extern "C" {
118
#include "arm_nnfunctions.h"
12-
}
9+
#include "cortex_m_ops_common.h"
1310

1411
namespace cortex_m {
1512
namespace native {

backends/cortex_m/ops/op_quantized_linear.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,8 @@
77
* LICENSE file in the root directory of this source tree.
88
*/
99

10-
#include "cortex_m_ops_common.h"
11-
12-
extern "C" {
1310
#include "arm_nnfunctions.h"
14-
}
11+
#include "cortex_m_ops_common.h"
1512

1613
namespace cortex_m {
1714
namespace native {

0 commit comments

Comments
 (0)