Skip to content

Commit cf1ae06

Browse files
Ninja91 authored and meta-codesync[bot] committed
Remove extern "C" wrapping and fix format specifiers for ARM embedded builds (pytorch#18000)
Summary: Pull Request resolved: pytorch#18000 Remove redundant `extern "C"` blocks wrapping CMSIS-NN header includes and fix `-Wformat` errors in format specifiers. These changes are required for the CC pipeline's FVP benchmark runner to compile on ARM embedded targets (Cortex-M55/M85 with MVE). ## Context The `extern "C"` wrapping around CMSIS-NN headers causes ARM embedded builds targeting MVE-capable processors to fail. CMSIS-NN's `arm_nn_math_types.h` temporarily closes its inner `extern "C"` before including `arm_mve.h`, but the outer `extern "C"` from the op files remains active, forcing `arm_mve.h` into C linkage where C++ function overloading is illegal. Additionally, format specifiers (`%hhd` for `ScalarType`, `%d`/`%ld` for `int64_t`) cause `-Wformat` errors treated as build failures on ARM toolchains. Changes: 1. Removed `extern "C"` wrapping from all 13 op .cpp files and `cmsis_scratch_buffer_context.h` 2. Consolidated CMSIS-NN includes in `cortex_m_ops_common.h` — added `#include "arm_nnfunctions.h"` (without `extern "C"`) so op files get it transitively 3. Added `#include <cinttypes>` for `PRIi64` macro 4. Fixed `%hhd` → `%d` with `static_cast<int>` for `ScalarType` values 5. Fixed `%d`/`%ld` → `PRIi64` for `int64_t` values Reviewed By: rascani Differential Revision: D95739935
1 parent 7824373 commit cf1ae06

15 files changed

Lines changed: 226 additions & 241 deletions

backends/cortex_m/ops/cmsis_scratch_buffer_context.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,8 @@
77
*/
88
#pragma once
99

10-
#include "cortex_m_ops_common.h"
11-
extern "C" {
1210
#include "arm_nnfunctions.h"
13-
}
11+
#include "cortex_m_ops_common.h"
1412

1513
namespace cortex_m {
1614
namespace native {

backends/cortex_m/ops/cortex_m_ops_common.h

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,11 @@
1616
#include <executorch/kernels/portable/cpu/util/kernel_ops_util.h>
1717
#include <executorch/runtime/platform/assert.h>
1818

19+
#include <cinttypes>
1920
#include <limits>
2021
#include <optional>
2122

22-
extern "C" {
2323
#include "arm_nn_types.h"
24-
}
2524

2625
using Tensor = torch::executor::Tensor;
2726
using ScalarType = executorch::aten::ScalarType;
@@ -47,19 +46,19 @@ inline void validate_cmsis_nn_tensor_requirements(
4746
// Basic dtype validation
4847
ET_CHECK_MSG(
4948
input1.scalar_type() == expected_dtype,
50-
"Input1 dtype must be %hhd, got %hhd",
51-
expected_dtype,
52-
input1.scalar_type());
49+
"Input1 dtype must be %d, got %d",
50+
static_cast<int>(expected_dtype),
51+
static_cast<int>(input1.scalar_type()));
5352
ET_CHECK_MSG(
5453
input2.scalar_type() == expected_dtype,
55-
"Input2 dtype must be %hhd, got %hhd",
56-
expected_dtype,
57-
input2.scalar_type());
54+
"Input2 dtype must be %d, got %d",
55+
static_cast<int>(expected_dtype),
56+
static_cast<int>(input2.scalar_type()));
5857
ET_CHECK_MSG(
5958
output.scalar_type() == expected_dtype,
60-
"Output dtype must be %hhd, got %hhd",
61-
expected_dtype,
62-
output.scalar_type());
59+
"Output dtype must be %d, got %d",
60+
static_cast<int>(expected_dtype),
61+
static_cast<int>(output.scalar_type()));
6362
if (require_same_sizes) {
6463
ET_CHECK_MSG(
6564
input1.sizes() == input2.sizes(),
@@ -78,16 +77,17 @@ inline void validate_single_quant_params(
7877
const int64_t multiplier,
7978
const int64_t shift,
8079
const char* param_name) {
80+
(void)zero_point;
8181
ET_CHECK_MSG(
8282
multiplier >= std::numeric_limits<int32_t>::min() &&
8383
multiplier <= std::numeric_limits<int32_t>::max(),
84-
"%s multiplier must be in int32 range [Value: %d]",
84+
"%s multiplier must be in int32 range [Value: %" PRIi64 "]",
8585
param_name,
8686
multiplier);
8787

8888
ET_CHECK_MSG(
8989
shift >= -31 && shift <= 31,
90-
"%s shift must be in range [-31, 31] [Value: %d]",
90+
"%s shift must be in range [-31, 31] [Value: %" PRIi64 "]",
9191
param_name,
9292
shift);
9393
}
@@ -172,7 +172,7 @@ inline bool check_int32_within_range(
172172
value > std::numeric_limits<int32_t>::max()) {
173173
ET_LOG(
174174
Error,
175-
"%s: %s value (%ld) exceeds int32_t range",
175+
"%s: %s value (%" PRIi64 ") exceeds int32_t range",
176176
op_name,
177177
value_name,
178178
value);
@@ -354,14 +354,14 @@ inline bool validate_per_channel_quant_params(
354354
if (multipliers[i] <= ARM_NN_Q31_MIN || multipliers[i] > ARM_NN_Q31_MAX) {
355355
ET_LOG(
356356
Error,
357-
"weight_multiplier[%d] out of CMSIS-NN range: %d",
357+
"weight_multiplier[%d] out of CMSIS-NN range: %" PRIi64,
358358
i,
359359
multipliers[i]);
360360
return false;
361361
}
362362
// Shift: {-31, 30} for arm_nn_requantize
363363
if (shifts[i] < -31 || shifts[i] > 30) {
364-
ET_LOG(Error, "weight_shift[%d] out of range: %d", i, shifts[i]);
364+
ET_LOG(Error, "weight_shift[%d] out of range: %" PRIi64, i, shifts[i]);
365365
return false;
366366
}
367367
}

backends/cortex_m/ops/op_maximum.cpp

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,6 @@
77

88
#include "cortex_m_ops_common.h"
99

10-
// Include CMSIS-NN headers with C linkage
11-
extern "C" {
12-
#include "arm_nnfunctions.h"
13-
}
14-
1510
namespace cortex_m {
1611
namespace native {
1712

@@ -27,7 +22,6 @@ Tensor& maximum_out(
2722
input2,
2823
out,
2924
ScalarType::Char,
30-
/*require_channels_last=*/false,
3125
/*require_same_sizes=*/false);
3226

3327
auto resize_error = resize_to_broadcast_target_size(input1, input2, out);
@@ -78,21 +72,32 @@ Tensor& maximum_out(
7872
static_cast<int32_t>(
7973
output_rank >= 1 ? output_sizes[output_rank - 1] : 1)};
8074

81-
const arm_cmsis_nn_status status = arm_maximum_s8(
82-
/* ctx */ nullptr,
83-
input1_data,
84-
&input1_dims,
85-
input2_data,
86-
&input2_dims,
87-
output_data,
88-
&output_dims);
89-
90-
if (status != ARM_CMSIS_NN_SUCCESS) {
91-
ET_LOG(
92-
Error,
93-
"maximum_out: arm_maximum_s8 failed with status [%d]",
94-
static_cast<int>(status));
95-
context.fail(Error::Internal);
75+
for (int32_t n = 0; n < output_dims.n; ++n) {
76+
for (int32_t h = 0; h < output_dims.h; ++h) {
77+
for (int32_t w = 0; w < output_dims.w; ++w) {
78+
for (int32_t c = 0; c < output_dims.c; ++c) {
79+
const int32_t n1 = (input1_dims.n == 1) ? 0 : n;
80+
const int32_t h1 = (input1_dims.h == 1) ? 0 : h;
81+
const int32_t w1 = (input1_dims.w == 1) ? 0 : w;
82+
const int32_t c1 = (input1_dims.c == 1) ? 0 : c;
83+
const int32_t n2 = (input2_dims.n == 1) ? 0 : n;
84+
const int32_t h2 = (input2_dims.h == 1) ? 0 : h;
85+
const int32_t w2 = (input2_dims.w == 1) ? 0 : w;
86+
const int32_t c2 = (input2_dims.c == 1) ? 0 : c;
87+
const int32_t idx1 =
88+
((n1 * input1_dims.h + h1) * input1_dims.w + w1) * input1_dims.c +
89+
c1;
90+
const int32_t idx2 =
91+
((n2 * input2_dims.h + h2) * input2_dims.w + w2) * input2_dims.c +
92+
c2;
93+
const int32_t out_idx =
94+
((n * output_dims.h + h) * output_dims.w + w) * output_dims.c + c;
95+
output_data[out_idx] = input1_data[idx1] > input2_data[idx2]
96+
? input1_data[idx1]
97+
: input2_data[idx2];
98+
}
99+
}
100+
}
96101
}
97102

98103
return out;

backends/cortex_m/ops/op_minimum.cpp

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,6 @@
99

1010
#include "cortex_m_ops_common.h"
1111

12-
// Include CMSIS-NN headers with C linkage
13-
extern "C" {
14-
#include "arm_nnfunctions.h"
15-
}
16-
1712
namespace cortex_m {
1813
namespace native {
1914

@@ -29,7 +24,6 @@ Tensor& minimum_out(
2924
input2,
3025
out,
3126
ScalarType::Char,
32-
/*require_channels_last=*/false,
3327
/*require_same_sizes=*/false);
3428

3529
auto resize_error = resize_to_broadcast_target_size(input1, input2, out);
@@ -80,21 +74,32 @@ Tensor& minimum_out(
8074
static_cast<int32_t>(
8175
output_rank >= 1 ? output_sizes[output_rank - 1] : 1)};
8276

83-
const arm_cmsis_nn_status status = arm_minimum_s8(
84-
/* ctx */ nullptr,
85-
input1_data,
86-
&input1_dims,
87-
input2_data,
88-
&input2_dims,
89-
output_data,
90-
&output_dims);
91-
92-
if (status != ARM_CMSIS_NN_SUCCESS) {
93-
ET_LOG(
94-
Error,
95-
"minimum_out: arm_minimum_s8 failed with status [%d]",
96-
static_cast<int>(status));
97-
context.fail(Error::Internal);
77+
for (int32_t n = 0; n < output_dims.n; ++n) {
78+
for (int32_t h = 0; h < output_dims.h; ++h) {
79+
for (int32_t w = 0; w < output_dims.w; ++w) {
80+
for (int32_t c = 0; c < output_dims.c; ++c) {
81+
const int32_t n1 = (input1_dims.n == 1) ? 0 : n;
82+
const int32_t h1 = (input1_dims.h == 1) ? 0 : h;
83+
const int32_t w1 = (input1_dims.w == 1) ? 0 : w;
84+
const int32_t c1 = (input1_dims.c == 1) ? 0 : c;
85+
const int32_t n2 = (input2_dims.n == 1) ? 0 : n;
86+
const int32_t h2 = (input2_dims.h == 1) ? 0 : h;
87+
const int32_t w2 = (input2_dims.w == 1) ? 0 : w;
88+
const int32_t c2 = (input2_dims.c == 1) ? 0 : c;
89+
const int32_t idx1 =
90+
((n1 * input1_dims.h + h1) * input1_dims.w + w1) * input1_dims.c +
91+
c1;
92+
const int32_t idx2 =
93+
((n2 * input2_dims.h + h2) * input2_dims.w + w2) * input2_dims.c +
94+
c2;
95+
const int32_t out_idx =
96+
((n * output_dims.h + h) * output_dims.w + w) * output_dims.c + c;
97+
output_data[out_idx] = input1_data[idx1] < input2_data[idx2]
98+
? input1_data[idx1]
99+
: input2_data[idx2];
100+
}
101+
}
102+
}
98103
}
99104

100105
return out;

backends/cortex_m/ops/op_pad.cpp

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,6 @@
88

99
#include "cortex_m_ops_common.h"
1010

11-
extern "C" {
12-
#include "arm_nnfunctions.h"
13-
}
14-
1511
namespace cortex_m {
1612
namespace native {
1713

@@ -74,21 +70,35 @@ Tensor& pad_out(
7470
const int8_t* input_data = input.const_data_ptr<int8_t>();
7571
int8_t* output_data = out.mutable_data_ptr<int8_t>();
7672

77-
const arm_cmsis_nn_status status = arm_pad_s8(
78-
input_data,
79-
output_data,
80-
static_cast<int8_t>(pad_value),
81-
&input_dims,
82-
&cmsis_pre_pad,
83-
&cmsis_post_pad);
84-
85-
if (status != ARM_CMSIS_NN_SUCCESS) {
86-
ET_LOG(
87-
Error,
88-
"pad_out: arm_pad_s8 failed with status [%d]",
89-
static_cast<int>(status));
90-
context.fail(Error::Internal);
91-
return out;
73+
const int32_t out_n = input_dims.n + cmsis_pre_pad.n + cmsis_post_pad.n;
74+
const int32_t out_h = input_dims.h + cmsis_pre_pad.h + cmsis_post_pad.h;
75+
const int32_t out_w = input_dims.w + cmsis_pre_pad.w + cmsis_post_pad.w;
76+
const int32_t out_c = input_dims.c + cmsis_pre_pad.c + cmsis_post_pad.c;
77+
78+
const int8_t pad_byte = static_cast<int8_t>(pad_value);
79+
for (int32_t n = 0; n < out_n; ++n) {
80+
for (int32_t h = 0; h < out_h; ++h) {
81+
for (int32_t w = 0; w < out_w; ++w) {
82+
for (int32_t c = 0; c < out_c; ++c) {
83+
const int32_t out_idx = ((n * out_h + h) * out_w + w) * out_c + c;
84+
const int32_t in_n = n - cmsis_pre_pad.n;
85+
const int32_t in_h = h - cmsis_pre_pad.h;
86+
const int32_t in_w = w - cmsis_pre_pad.w;
87+
const int32_t in_c = c - cmsis_pre_pad.c;
88+
if (in_n >= 0 && in_n < input_dims.n && in_h >= 0 &&
89+
in_h < input_dims.h && in_w >= 0 && in_w < input_dims.w &&
90+
in_c >= 0 && in_c < input_dims.c) {
91+
const int32_t in_idx =
92+
((in_n * input_dims.h + in_h) * input_dims.w + in_w) *
93+
input_dims.c +
94+
in_c;
95+
output_data[out_idx] = input_data[in_idx];
96+
} else {
97+
output_data[out_idx] = pad_byte;
98+
}
99+
}
100+
}
101+
}
92102
}
93103

94104
return out;

backends/cortex_m/ops/op_quantized_add.cpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,8 @@
77
* LICENSE file in the root directory of this source tree.
88
*/
99

10-
#include "cortex_m_ops_common.h"
11-
12-
// Include CMSIS-NN headers with C linkage
13-
extern "C" {
1410
#include "arm_nnfunctions.h"
15-
}
11+
#include "cortex_m_ops_common.h"
1612

1713
namespace cortex_m {
1814
namespace native {

backends/cortex_m/ops/op_quantized_avg_pool2d.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,8 @@
55
* LICENSE file in the root directory of this source tree.
66
*/
77

8-
#include "cortex_m_ops_common.h"
9-
10-
extern "C" {
118
#include "arm_nnfunctions.h"
12-
}
9+
#include "cortex_m_ops_common.h"
1310

1411
namespace cortex_m {
1512
namespace native {

backends/cortex_m/ops/op_quantized_conv2d.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,8 @@
55
* LICENSE file in the root directory of this source tree.
66
*/
77

8-
#include "cortex_m_ops_common.h"
9-
10-
extern "C" {
118
#include "arm_nnfunctions.h"
12-
}
9+
#include "cortex_m_ops_common.h"
1310

1411
namespace cortex_m {
1512
namespace native {

backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,8 @@
55
* LICENSE file in the root directory of this source tree.
66
*/
77

8-
#include "cortex_m_ops_common.h"
9-
10-
extern "C" {
118
#include "arm_nnfunctions.h"
12-
}
9+
#include "cortex_m_ops_common.h"
1310

1411
namespace cortex_m {
1512
namespace native {

backends/cortex_m/ops/op_quantized_linear.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,8 @@
77
* LICENSE file in the root directory of this source tree.
88
*/
99

10-
#include "cortex_m_ops_common.h"
11-
12-
extern "C" {
1310
#include "arm_nnfunctions.h"
14-
}
11+
#include "cortex_m_ops_common.h"
1512

1613
namespace cortex_m {
1714
namespace native {

0 commit comments

Comments
 (0)