# Patch summary (commentary only; text placed ahead of the first `diff --git` header).
#
# Files touched:
#   1. sdc/datatypes/hpat_pandas_series_functions.py
#   2. sdc/tests/tests_perf/test_perf_series.py
#
# File 1, import hunk (@@ -63,12 +63,14 @@):
#   * adds `str_list_to_array` to the names imported from `sdc.str_arr_ext`;
#   * adds `from sdc.utilities.prange_utils import parallel_chunks`.
#     NOTE(review): no other added line in this patch references `parallel_chunks` --
#     confirm it is needed elsewhere in the target file.
#
# File 1, docstring hunk (@@ -2048,7 +2050,7 @@):
#   * the developer "Test:" command changes from the single test
#     `test_series_isin_list1` to a `-k ...test_series_isin*` pattern.
#
# File 1, implementation hunk (@@ -2059,10 +2061,31 @@):
#   * removes the list-comprehension implementation
#     `[(x in values) for x in self._data]`;
#   * adds two `hpat_pandas_series_isin_impl` variants chosen on `values.dtype`:
#       - `types.UnicodeType` / `types.StringLiteral`: `values` goes through
#         `str_list_to_array(list(values))` and then `set(...)`;
#       - otherwise: `values` goes straight through `set(values)`.
#     Both variants allocate `numpy.empty(data_len, dtype=numpy.bool_)`, fill it in a
#     `prange(data_len)` loop with `self._data[i] in values`, and return a
#     `pandas.Series` built from that array with the original index and name.
#     NOTE(review): `prange` is not imported by this patch -- presumably it is already
#     imported in the target file; TODO confirm.
#     NOTE(review): apart from the `str_list_to_array` conversion line, the two variants
#     are identical.
#
# File 2, perf-test hunk (@@ -107,7 +107,9 @@):
#   * replaces the `isin` case that used `call_expr='data.isin([0])'` with one using
#     `params='values=[0]'`;
#   * adds a second `isin` case over string data (explicit `call_expr` with a list of
#     strings plus `input_data`).
#
# NOTE(review): in this copy the patch's line breaks appear to have been collapsed into
# spaces (headers, hunks and context all share three physical lines), so the newlines
# would need restoring before a patch tool can consume it -- verify against the
# original commit.
diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 5e3040066..e7d4fa51b 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -63,12 +63,14 @@ from sdc.str_arr_type import (StringArrayType, string_array_type) from sdc.str_arr_ext import (str_arr_is_na, str_arr_set_na, num_total_chars, pre_alloc_string_array, cp_str_list_to_array, - create_str_arr_from_list, str_arr_set_na_by_mask) + create_str_arr_from_list, str_arr_set_na_by_mask, + str_list_to_array) from sdc.utilities.utils import to_array, sdc_overload, sdc_overload_method, sdc_overload_attribute from sdc import sdc_autogenerated from sdc.functions import numpy_like from sdc.hiframes.api import isna from sdc.datatypes.hpat_pandas_groupby_functions import init_series_groupby +from sdc.utilities.prange_utils import parallel_chunks from .pandas_series_functions import apply from .pandas_series_functions import map as _map @@ -2048,7 +2050,7 @@ def hpat_pandas_series_isin(self, values): Pandas Series method :meth:`pandas.Series.isin` implementation. .. only:: developer - Test: python -m sdc.runtests sdc.tests.test_series.TestSeries.test_series_isin_list1 + Test: python -m sdc.runtests -k sdc.tests.test_series.TestSeries.test_series_isin* """ _func_name = 'Method isin().' 
@@ -2059,10 +2061,31 @@ def hpat_pandas_series_isin(self, values): if not isinstance(values, (types.Set, types.List)): ty_checker.raise_exc(values, 'set or list', 'values') - def hpat_pandas_series_isin_impl(self, values): - # TODO: replace with below line when Numba supports np.isin in nopython mode - # return pandas.Series(np.isin(self._data, values)) - return pandas.Series(data=[(x in values) for x in self._data], index=self._index, name=self._name) + if isinstance(values.dtype, (types.UnicodeType, types.StringLiteral)): + def hpat_pandas_series_isin_impl(self, values): + # TODO: replace with below line when Numba supports np.isin in nopython mode + # return pandas.Series (np.isin (self._data, values)) + + values = str_list_to_array(list(values)) + values = set(values) + data_len = len(self._data) + result = numpy.empty(data_len, dtype=numpy.bool_) + for i in prange(data_len): + result[i] = self._data[i] in values + + return pandas.Series(data=result, index=self._index, name=self._name) + else: + def hpat_pandas_series_isin_impl(self, values): + # TODO: replace with below line when Numba supports np.isin in nopython mode + # return pandas.Series (np.isin (self._data, values)) + + values = set(values) + data_len = len(self._data) + result = numpy.empty(data_len, dtype=numpy.bool_) + for i in prange(data_len): + result[i] = self._data[i] in values + + return pandas.Series(data=result, index=self._index, name=self._name) return hpat_pandas_series_isin_impl diff --git a/sdc/tests/tests_perf/test_perf_series.py b/sdc/tests/tests_perf/test_perf_series.py index d99396220..8d2c86a67 100644 --- a/sdc/tests/tests_perf/test_perf_series.py +++ b/sdc/tests/tests_perf/test_perf_series.py @@ -107,7 +107,9 @@ def _test_case(self, pyfunc, name, total_data_length, input_data=None, data_num= TC(name='idxmin', size=[10 ** 8], check_skipna=True), TC(name='iloc', size=[10 ** 7], call_expr='data.iloc[100000]', usecase_params='data'), TC(name='index', size=[10 ** 7], 
call_expr='data.index', usecase_params='data'), - TC(name='isin', size=[10 ** 7], call_expr='data.isin([0])', usecase_params='data'), + TC(name='isin', size=[10 ** 7], params='values=[0]'), + TC(name='isin', size=[10 ** 7], call_expr='data.isin(["a", "q", "c", "q", "d", "q", "e"])', usecase_params='data', + input_data=[['a', 'b', 'q', 'w', 'c', 'd', 'e', 'r']]), TC(name='isna', size=[10 ** 7]), TC(name='isnull', size=[10 ** 7]), TC(name='le', size=[10 ** 7], params='other', data_num=2),