From 12da570f2e0e0a60c0fe462c165d1b8dc5afc581 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Sun, 16 Feb 2020 20:56:59 +0300 Subject: [PATCH 1/5] Add perf test for np.mean() --- sdc/tests/tests_perf/test_perf_numpy.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sdc/tests/tests_perf/test_perf_numpy.py b/sdc/tests/tests_perf/test_perf_numpy.py index 97b3f7135..df8d3e337 100644 --- a/sdc/tests/tests_perf/test_perf_numpy.py +++ b/sdc/tests/tests_perf/test_perf_numpy.py @@ -95,6 +95,10 @@ def _test_case(self, cases, name, total_data_length, data_num=1, input_data=test CE(type_='Numba', code='np.isnan(data)', jitted=True), CE(type_='SDC', code='sdc.functions.numpy_like.isnan(data)', jitted=True), ], usecase_params='data'), + TC(name='mean', size=[10 ** 8], call_expr=[ + CE(type_='Python', code='np.mean(data)', jitted=False), + CE(type_='Numba', code='np.mean(data)', jitted=True), + ], usecase_params='data'), TC(name='nansum', size=[10 ** 7], call_expr=[ CE(type_='Python', code='np.nansum(data)', jitted=False), CE(type_='SDC', code='sdc.functions.numpy_like.nansum(data)', jitted=True), From f89bb2260e6b7a452d3cb4986733d29fc61d6a4b Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Sun, 16 Feb 2020 22:16:37 +0300 Subject: [PATCH 2/5] Add numpy_like nanmean --- sdc/datatypes/hpat_pandas_series_functions.py | 2 +- sdc/functions/numpy_like.py | 25 +++++++++++++++++++ sdc/tests/tests_perf/test_perf_series.py | 3 ++- 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/sdc/datatypes/hpat_pandas_series_functions.py b/sdc/datatypes/hpat_pandas_series_functions.py index 37825a745..658d6460c 100644 --- a/sdc/datatypes/hpat_pandas_series_functions.py +++ b/sdc/datatypes/hpat_pandas_series_functions.py @@ -3787,7 +3787,7 @@ def hpat_pandas_series_mean_impl(self, axis=None, skipna=None, level=None, numer _skipna = skipna if _skipna: - return numpy.nanmean(self._data) + return numpy_like.nanmean(self._data) return self._data.mean() diff --git a/sdc/functions/numpy_like.py b/sdc/functions/numpy_like.py index 4078cf555..cbdb43904 100644 --- a/sdc/functions/numpy_like.py +++ b/sdc/functions/numpy_like.py @@ -33,6 +33,7 @@ import numba import numpy +import numpy as np from numba import types, jit, prange, numpy_support, literally from numba.errors import TypingError @@ -472,3 +473,27 @@ def nanprod_impl(a): return c return nanprod_impl + + +def nanmean(a): + pass + + +@sdc_overload(nanmean) +def np_nanmean(a): + if not isinstance(a, types.Array): + return + isnan = get_isnan(a.dtype) + + def nanmean_impl(a): + c = 0.0 + count = 0 + for i in prange(len(a)): + v = a[i] + if not isnan(v): + c += v + count += 1 + # np.divide() doesn't raise ZeroDivisionError + return np.divide(c, count) + + return nanmean_impl diff --git a/sdc/tests/tests_perf/test_perf_series.py b/sdc/tests/tests_perf/test_perf_series.py index 286c8f67c..272d211bd 100644 --- a/sdc/tests/tests_perf/test_perf_series.py +++ b/sdc/tests/tests_perf/test_perf_series.py @@ -103,7 +103,8 @@ def _test_case(self, pyfunc, name, total_data_length, data_num=1, input_data=tes TC(name='map', size=[10 ** 7], params='{2.: 42., 4.: 3.14}'), TC(name='max', size=[10 ** 8], params='skipna=True'), TC(name='max', size=[10 ** 8], params='skipna=False'), - TC(name='mean', size=[10 ** 8]), + TC(name='mean', size=[10 ** 8], params='skipna=True'), + TC(name='mean', size=[10 ** 8], params='skipna=False'), TC(name='median', size=[10 ** 8]), TC(name='min', size=[10 ** 8], params='skipna=True'), TC(name='min', size=[10 ** 8], params='skipna=False'), From bc40a96b9bf16ed8a58e6a113383dc4619c417a6 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Mon, 17 Feb 2020 14:22:46 +0300 Subject: [PATCH 3/5] Refactor tests for Series.mean() --- sdc/tests/test_series.py | 62 ++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 38 deletions(-) diff --git a/sdc/tests/test_series.py b/sdc/tests/test_series.py index 5e08b9c7b..1d6161cbf 100644 --- a/sdc/tests/test_series.py +++ b/sdc/tests/test_series.py @@ -2410,58 +2410,44 @@ def test_impl(S): S = pd.Series(['aa', 'bb', np.nan]) self.assertEqual(hpat_func(S), test_impl(S)) + def _mean_data_samples(self): + yield [6, 6, 2, 1, 3, 3, 2, 1, 2] + yield [1.1, 0.3, 2.1, 1, 3, 0.3, 2.1, 1.1, 2.2] + yield [6, 6.1, 2.2, 1, 3, 3, 2.2, 1, 2] + yield [6, 6, np.nan, 2, np.nan, 1, 3, 3, np.inf, 2, 1, 2, np.inf] + yield [1.1, 0.3, np.nan, 1.0, np.inf, 0.3, 2.1, np.nan, 2.2, np.inf] + yield [1.1, 0.3, np.nan, 1, np.inf, 0, 1.1, np.nan, 2.2, np.inf, 2, 2] + yield [np.nan, np.nan, np.nan] + yield [np.nan, np.nan, np.inf] + + def _check_mean(self, pyfunc, *args): + cfunc = self.jit(pyfunc) + + actual = cfunc(*args) + expected = pyfunc(*args) + if np.isnan(actual) or np.isnan(expected): + self.assertEqual(np.isnan(actual), np.isnan(expected)) + else: + self.assertEqual(actual, expected) + def test_series_mean(self): def test_impl(S): return S.mean() - hpat_func = self.jit(test_impl) - - data_samples = [ - [6, 6, 2, 1, 3, 3, 2, 1, 2], - [1.1, 0.3, 2.1, 1, 3, 0.3, 2.1, 1.1, 2.2], - [6, 6.1, 2.2, 1, 3, 3, 2.2, 1, 2], - [6, 6, np.nan, 2, np.nan, 1, 3, 3, np.inf, 2, 1, 2, np.inf], - [1.1, 0.3, np.nan, 1.0, np.inf, 0.3, 2.1, np.nan, 2.2, np.inf], - [1.1, 0.3, np.nan, 1, np.inf, 0, 1.1, np.nan, 2.2, np.inf, 2, 2], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.inf], - ] - for data in data_samples: + for data in self._mean_data_samples(): with self.subTest(data=data): S = pd.Series(data) - actual = hpat_func(S) - expected = test_impl(S) - if np.isnan(actual) or np.isnan(expected): - self.assertEqual(np.isnan(actual), np.isnan(expected)) - else: - self.assertEqual(actual, expected) + self._check_mean(test_impl, S) @skip_sdc_jit("Series.mean() any parameters unsupported") def test_series_mean_skipna(self): def test_impl(S, skipna): return S.mean(skipna=skipna) - hpat_func = self.jit(test_impl) - - data_samples = [ - [6, 6, 2, 1, 3, 3, 2, 1, 2], - [1.1, 0.3, 2.1, 1, 3, 0.3, 2.1, 1.1, 2.2], - [6, 6.1, 2.2, 1, 3, 3, 2.2, 1, 2], - [6, 6, np.nan, 2, np.nan, 1, 3, 3, np.inf, 2, 1, 2, np.inf], - [1.1, 0.3, np.nan, 1.0, np.inf, 0.3, 2.1, np.nan, 2.2, np.inf], - [1.1, 0.3, np.nan, 1, np.inf, 0, 1.1, np.nan, 2.2, np.inf, 2, 2], - [np.nan, np.nan, np.nan], - [np.nan, np.nan, np.inf], - ] for skipna in [True, False]: - for data in data_samples: + for data in self._mean_data_samples(): S = pd.Series(data) - actual = hpat_func(S, skipna) - expected = test_impl(S, skipna) - if np.isnan(actual) or np.isnan(expected): - self.assertAlmostEqual(np.isnan(actual), np.isnan(expected)) - else: - self.assertAlmostEqual(actual, expected) + self._check_mean(test_impl, S, skipna) def test_series_var1(self): def test_impl(S): From 624d844e599e4bae354f69fe03fbe5beae5da7f1 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Mon, 17 Feb 2020 14:28:27 +0300 Subject: [PATCH 4/5] Add tests for numpy_like.nanmean() --- sdc/tests/test_sdc_numpy.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sdc/tests/test_sdc_numpy.py b/sdc/tests/test_sdc_numpy.py index ece86bf6b..f2acd405b 100644 --- a/sdc/tests/test_sdc_numpy.py +++ b/sdc/tests/test_sdc_numpy.py @@ -264,6 +264,15 @@ def cases(): with self.subTest(data=case): np.testing.assert_array_equal(alt_cfunc(case), pyfunc(case)) + def test_nanmean(self): + def ref_impl(a): + return np.nanmean(a) + + def sdc_impl(a): + return numpy_like.nanmean(a) + + self.check_reduction_basic(ref_impl, sdc_impl) + def test_nanmin(self): def ref_impl(a): return np.nanmin(a) From cdb8a9e5a6473ae34e018f67a66c08aa679b48b8 Mon Sep 17 00:00:00 2001 From: Sergey Pokhodenko Date: Mon, 17 Feb 2020 15:03:01 +0300 Subject: [PATCH 5/5] Add perf test for nanmean --- sdc/tests/tests_perf/test_perf_numpy.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sdc/tests/tests_perf/test_perf_numpy.py b/sdc/tests/tests_perf/test_perf_numpy.py index df8d3e337..1d928e35d 100644 --- a/sdc/tests/tests_perf/test_perf_numpy.py +++ b/sdc/tests/tests_perf/test_perf_numpy.py @@ -95,9 +95,10 @@ def _test_case(self, cases, name, total_data_length, data_num=1, input_data=test CE(type_='Numba', code='np.isnan(data)', jitted=True), CE(type_='SDC', code='sdc.functions.numpy_like.isnan(data)', jitted=True), ], usecase_params='data'), - TC(name='mean', size=[10 ** 8], call_expr=[ - CE(type_='Python', code='np.mean(data)', jitted=False), - CE(type_='Numba', code='np.mean(data)', jitted=True), + TC(name='nanmean', size=[10 ** 8], call_expr=[ + CE(type_='Python', code='np.nanmean(data)', jitted=False), + CE(type_='Numba', code='np.nanmean(data)', jitted=True), + CE(type_='SDC', code='sdc.functions.numpy_like.nanmean(data)', jitted=True), ], usecase_params='data'), TC(name='nansum', size=[10 ** 7], call_expr=[ CE(type_='Python', code='np.nansum(data)', jitted=False),