diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py
index 37825a745..658d6460c 100644
--- a/sdc/datatypes/hpat_pandas_series_functions.py
+++ b/sdc/datatypes/hpat_pandas_series_functions.py
@@ -3787,7 +3787,7 @@ def hpat_pandas_series_mean_impl(self, axis=None, skipna=None, level=None, numer
             _skipna = skipna
 
         if _skipna:
-            return numpy.nanmean(self._data)
+            return numpy_like.nanmean(self._data)
 
         return self._data.mean()
 
diff --git a/sdc/functions/numpy_like.py b/sdc/functions/numpy_like.py
index 4078cf555..cbdb43904 100644
--- a/sdc/functions/numpy_like.py
+++ b/sdc/functions/numpy_like.py
@@ -33,6 +33,7 @@
 
 import numba
 import numpy
+import numpy as np
 
 from numba import types, jit, prange, numpy_support, literally
 from numba.errors import TypingError
@@ -472,3 +473,34 @@ def nanprod_impl(a):
         return c
 
     return nanprod_impl
+
+
+def nanmean(a):
+    """Stub whose implementation is provided by the sdc_overload below."""
+    pass
+
+
+@sdc_overload(nanmean)
+def np_nanmean(a):
+    """Overload of nanmean() for arrays: mean of the elements that are not NaN.
+
+    Returns no implementation when `a` is not a types.Array.
+    """
+    if not isinstance(a, types.Array):
+        return
+    isnan = get_isnan(a.dtype)
+
+    def nanmean_impl(a):
+        # Accumulate the sum and the number of non-NaN elements in one pass.
+        c = 0.0
+        count = 0
+        for i in prange(len(a)):
+            v = a[i]
+            if not isnan(v):
+                c += v
+                count += 1
+        # np.divide() doesn't raise ZeroDivisionError, so an input without
+        # any non-NaN element (count == 0) does not raise here.
+        return np.divide(c, count)
+
+    return nanmean_impl
diff --git a/sdc/tests/test_sdc_numpy.py b/sdc/tests/test_sdc_numpy.py
index ece86bf6b..f2acd405b 100644
--- a/sdc/tests/test_sdc_numpy.py
+++ b/sdc/tests/test_sdc_numpy.py
@@ -264,6 +264,15 @@ def cases():
             with self.subTest(data=case):
                 np.testing.assert_array_equal(alt_cfunc(case), pyfunc(case))
 
+    def test_nanmean(self):
+        def ref_impl(a):
+            return np.nanmean(a)
+
+        def sdc_impl(a):
+            return numpy_like.nanmean(a)
+
+        self.check_reduction_basic(ref_impl, sdc_impl)
+
     def test_nanmin(self):
         def ref_impl(a):
             return np.nanmin(a)
diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py
index 5e08b9c7b..1d6161cbf 100644
--- a/sdc/tests/test_series.py
+++ b/sdc/tests/test_series.py
@@ -2410,58 +2410,48 @@ def test_impl(S):
         S = pd.Series(['aa', 'bb', np.nan])
         self.assertEqual(hpat_func(S), test_impl(S))
 
+    def _mean_data_samples(self):
+        """Yield the input lists shared by the Series.mean() tests."""
+        yield [6, 6, 2, 1, 3, 3, 2, 1, 2]
+        yield [1.1, 0.3, 2.1, 1, 3, 0.3, 2.1, 1.1, 2.2]
+        yield [6, 6.1, 2.2, 1, 3, 3, 2.2, 1, 2]
+        yield [6, 6, np.nan, 2, np.nan, 1, 3, 3, np.inf, 2, 1, 2, np.inf]
+        yield [1.1, 0.3, np.nan, 1.0, np.inf, 0.3, 2.1, np.nan, 2.2, np.inf]
+        yield [1.1, 0.3, np.nan, 1, np.inf, 0, 1.1, np.nan, 2.2, np.inf, 2, 2]
+        yield [np.nan, np.nan, np.nan]
+        yield [np.nan, np.nan, np.inf]
+
+    def _check_mean(self, pyfunc, *args):
+        """Compare jitted pyfunc(*args) with the pure-Python result; NaN matches NaN."""
+        cfunc = self.jit(pyfunc)
+
+        actual = cfunc(*args)
+        expected = pyfunc(*args)
+        if np.isnan(actual) or np.isnan(expected):
+            self.assertEqual(np.isnan(actual), np.isnan(expected))
+        else:
+            # compare approximately: the jitted reduction may round differently than pandas
+            self.assertAlmostEqual(actual, expected)
+
     def test_series_mean(self):
         def test_impl(S):
             return S.mean()
 
-        hpat_func = self.jit(test_impl)
-
-        data_samples = [
-            [6, 6, 2, 1, 3, 3, 2, 1, 2],
-            [1.1, 0.3, 2.1, 1, 3, 0.3, 2.1, 1.1, 2.2],
-            [6, 6.1, 2.2, 1, 3, 3, 2.2, 1, 2],
-            [6, 6, np.nan, 2, np.nan, 1, 3, 3, np.inf, 2, 1, 2, np.inf],
-            [1.1, 0.3, np.nan, 1.0, np.inf, 0.3, 2.1, np.nan, 2.2, np.inf],
-            [1.1, 0.3, np.nan, 1, np.inf, 0, 1.1, np.nan, 2.2, np.inf, 2, 2],
-            [np.nan, np.nan, np.nan],
-            [np.nan, np.nan, np.inf],
-        ]
-        for data in data_samples:
+        for data in self._mean_data_samples():
             with self.subTest(data=data):
                 S = pd.Series(data)
-                actual = hpat_func(S)
-                expected = test_impl(S)
-                if np.isnan(actual) or np.isnan(expected):
-                    self.assertEqual(np.isnan(actual), np.isnan(expected))
-                else:
-                    self.assertEqual(actual, expected)
+                self._check_mean(test_impl, S)
 
     @skip_sdc_jit("Series.mean() any parameters unsupported")
     def test_series_mean_skipna(self):
         def test_impl(S, skipna):
             return S.mean(skipna=skipna)
 
-        hpat_func = self.jit(test_impl)
-
-        data_samples = [
-            [6, 6, 2, 1, 3, 3, 2, 1, 2],
-            [1.1, 0.3, 2.1, 1, 3, 0.3, 2.1, 1.1, 2.2],
-            [6, 6.1, 2.2, 1, 3, 3, 2.2, 1, 2],
-            [6, 6, np.nan, 2, np.nan, 1, 3, 3, np.inf, 2, 1, 2, np.inf],
-            [1.1, 0.3, np.nan, 1.0, np.inf, 0.3, 2.1, np.nan, 2.2, np.inf],
-            [1.1, 0.3, np.nan, 1, np.inf, 0, 1.1, np.nan, 2.2, np.inf, 2, 2],
-            [np.nan, np.nan, np.nan],
-            [np.nan, np.nan, np.inf],
-        ]
         for skipna in [True, False]:
-            for data in data_samples:
-                S = pd.Series(data)
-                actual = hpat_func(S, skipna)
-                expected = test_impl(S, skipna)
-                if np.isnan(actual) or np.isnan(expected):
-                    self.assertAlmostEqual(np.isnan(actual), np.isnan(expected))
-                else:
-                    self.assertAlmostEqual(actual, expected)
+            for data in self._mean_data_samples():
+                with self.subTest(skipna=skipna, data=data):
+                    S = pd.Series(data)
+                    self._check_mean(test_impl, S, skipna)
 
     def test_series_var1(self):
         def test_impl(S):
diff --git a/sdc/tests/tests_perf/test_perf_numpy.py b/sdc/tests/tests_perf/test_perf_numpy.py
index 97b3f7135..1d928e35d 100644
--- a/sdc/tests/tests_perf/test_perf_numpy.py
+++ b/sdc/tests/tests_perf/test_perf_numpy.py
@@ -95,6 +95,11 @@ def _test_case(self, cases, name, total_data_length, data_num=1, input_data=test
         CE(type_='Numba', code='np.isnan(data)', jitted=True),
         CE(type_='SDC', code='sdc.functions.numpy_like.isnan(data)', jitted=True),
     ], usecase_params='data'),
+    TC(name='nanmean', size=[10 ** 8], call_expr=[
+        CE(type_='Python', code='np.nanmean(data)', jitted=False),
+        CE(type_='Numba', code='np.nanmean(data)', jitted=True),
+        CE(type_='SDC', code='sdc.functions.numpy_like.nanmean(data)', jitted=True),
+    ], usecase_params='data'),
     TC(name='nansum', size=[10 ** 7], call_expr=[
         CE(type_='Python', code='np.nansum(data)', jitted=False),
         CE(type_='SDC', code='sdc.functions.numpy_like.nansum(data)', jitted=True),
diff --git a/sdc/tests/tests_perf/test_perf_series.py b/sdc/tests/tests_perf/test_perf_series.py
index 286c8f67c..272d211bd 100644
--- a/sdc/tests/tests_perf/test_perf_series.py
+++ b/sdc/tests/tests_perf/test_perf_series.py
@@ -103,7 +103,8 @@ def _test_case(self, pyfunc, name, total_data_length, data_num=1, input_data=tes
     TC(name='map', size=[10 ** 7], params='{2.: 42., 4.: 3.14}'),
     TC(name='max', size=[10 ** 8], params='skipna=True'),
     TC(name='max', size=[10 ** 8], params='skipna=False'),
-    TC(name='mean', size=[10 ** 8]),
+    TC(name='mean', size=[10 ** 8], params='skipna=True'),
+    TC(name='mean', size=[10 ** 8], params='skipna=False'),
     TC(name='median', size=[10 ** 8]),
     TC(name='min', size=[10 ** 8], params='skipna=True'),
     TC(name='min', size=[10 ** 8], params='skipna=False'),