diff --git a/_unittests/ut_df/test_pandas_groupbynan.py b/_unittests/ut_df/test_pandas_groupbynan.py
index eb21e22..94482d5 100644
--- a/_unittests/ut_df/test_pandas_groupbynan.py
+++ b/_unittests/ut_df/test_pandas_groupbynan.py
@@ -49,7 +49,7 @@ def test_pandas_groupbynan(self):
                 gr = pandas_groupby_nan(df, ("value", "this"))
                 t = True
                 raise AssertionError("---")
-            except TypeError:
+            except (TypeError, KeyError):
                 t = False
             if t:
                 co = gr.sum()
@@ -91,7 +91,7 @@ def test_pandas_groupbynan_tuple(self):
 
     def test_pandas_groupbynan_regular(self):
         df = pandas.DataFrame([dict(a="a", b=1), dict(a="a", b=2)])
-        gr = df.groupby(["a"]).sum()
+        gr = df.groupby(["a"], as_index=False).sum()
         gr2_ = pandas_groupby_nan(df, ["a"]).sum()
         self.assertEqualDataFrame(gr, gr2_)
 
@@ -99,9 +99,6 @@ def test_pandas_groupbynan_regular_nanback(self):
         df = pandas.DataFrame([dict(a="a", b=1, cc=0), dict(a="a", b=2)])
         gr = df.groupby(["a", "cc"]).sum()
         self.assertEqual(len(gr), 1)
-        self.assertRaise(
-            lambda: pandas_groupby_nan(df, ["a", "cc"], nanback=True).sum(),
-            NotImplementedError)
 
     def test_pandas_groupbynan_doc(self):
         data = [dict(a=2, ind="a", n=1),
@@ -132,10 +129,9 @@ def test_pandas_groupbynan_doc3(self):
                 dict(a=3, ind="b"),
                 dict(a=30)]
         df = pandas.DataFrame(data)
-        self.assertRaise(lambda: pandas_groupby_nan(df, ["ind", "n"]).sum(),
-                         NotImplementedError)
-        # ind = list(gr2['ind'])
-        # self.assertTrue(numpy.isnan(ind[-1]))
+        gr2 = pandas_groupby_nan(df, ["ind", "n"]).sum()
+        ind = list(gr2['ind'])
+        self.assertTrue(numpy.isnan(ind[-1]))
 
 
 if __name__ == "__main__":
diff --git a/_unittests/ut_df/test_streaming_dataframe.py b/_unittests/ut_df/test_streaming_dataframe.py
index f293ac8..11fdc51 100644
--- a/_unittests/ut_df/test_streaming_dataframe.py
+++ b/_unittests/ut_df/test_streaming_dataframe.py
@@ -364,8 +364,8 @@ def test_groupby(self):
 
         # Do not replace lambda c:sum(c) by sum or...
        # pandas.core.base.SpecificationError: Function names must be unique, found multiple named sum
-        gr2 = df20.groupby("key").agg([numpy.sum, lambda c:sum(c)])
-        gr = sdf20.groupby("key", lambda gr: gr.agg(
+        gr2 = df20.drop("cstr", axis=1).groupby("key").agg([numpy.sum, lambda c:sum(c)])
+        gr = sdf20.drop("cstr", axis=1).groupby("key", lambda gr: gr.agg(
             [numpy.sum, lambda c:sum(c)]))
         self.assertEqualDataFrame(gr, gr2)
 
diff --git a/pandas_streaming/df/dataframe.py b/pandas_streaming/df/dataframe.py
index bfc5a69..843e4da 100644
--- a/pandas_streaming/df/dataframe.py
+++ b/pandas_streaming/df/dataframe.py
@@ -12,7 +12,10 @@
 import numpy.random as nrandom
 import pandas
 from pandas.testing import assert_frame_equal
-from pandas.io.json import json_normalize
+try:
+    from pandas import json_normalize
+except ImportError:
+    from pandas.io.json import json_normalize
 from .dataframe_split import sklearn_train_test_split, sklearn_train_test_split_streaming
 from .dataframe_io_helpers import enumerate_json_items, JsonIterator2Stream
 
@@ -609,6 +612,22 @@ def reservoir_iterate(sdf, indices, chunksize):
         return StreamingDataFrame(
             lambda: reservoir_iterate(sdf=self, indices=indices, chunksize=1000))
 
+    def drop(self, labels=None, *, axis=0, index=None, columns=None, level=None,
+             inplace=False, errors='raise') -> 'StreamingDataFrame':
+        """
+        Applies :epkg:`pandas:DataFrame:drop`.
+        This function returns a @see cl StreamingDataFrame.
+        """
+        if axis == 0:
+            raise NotImplementedError(f"drop is not implemented for axis={axis}.")
+        if inplace:
+            raise NotImplementedError(f"drop is not implemented for inplace={inplace}.")
+        return StreamingDataFrame(
+            lambda: map(lambda df: df.drop(
+                labels, axis=axis, index=index, columns=columns,
+                level=level, inplace=False, errors=errors), self),
+            **self.get_kwargs())
+
     def apply(self, *args, **kwargs) -> 'StreamingDataFrame':
         """
         Applies :epkg:`pandas:DataFrame:apply`.
@@ -1078,8 +1097,7 @@ def iterate_na(self, **kwargs):
         return StreamingDataFrame(
             lambda: iterate_na(self, **kwargs), **self.get_kwargs())
 
-    def describe(self, percentiles=None, include=None, exclude=None,
-                 datetime_is_numeric=False):
+    def describe(self, percentiles=None, include=None, exclude=None):
         """
         Calls :epkg:`pandas:DataFrame:describe` on every piece of the datasets.
         *percentiles* are not really accurate
@@ -1088,16 +1106,19 @@ def describe(self, percentiles=None, include=None, exclude=None,
         :param percentiles: see :epkg:`pandas:DataFrame:describe`
         :param include: see :epkg:`pandas:DataFrame:describe`
         :param exclude: see :epkg:`pandas:DataFrame:describe`
-        :param datetime_is_numeric: see :epkg:`pandas:DataFrame:describe`
         :return: :epkg:`pandas:DataFrame:describe`
+
+        .. versionchanged:: 0.3.219
+
+            Parameter *datetime_is_numeric* was removed
+            (see :epkg:`pandas:DataFrame:describe`).
         """
         merged = None
         stack = []
         notper = ['count', 'mean', 'std']
         for df in self:
             desc = df.describe(
-                percentiles=percentiles, include=include, exclude=exclude,
-                datetime_is_numeric=datetime_is_numeric)
+                percentiles=percentiles, include=include, exclude=exclude)
             count = desc.loc['count', :]
             rows = [name for name in desc.index if name not in notper]
             stack.append(desc.loc[rows, :])
@@ -1120,8 +1141,7 @@ def describe(self, percentiles=None, include=None, exclude=None,
             merged.loc['std', :] / merged.loc['count', :] -
             merged.loc['mean', :] ** 2) ** 0.5
         values = pandas.concat(stack)
-        summary = values.describe(percentiles=percentiles,
-                                  datetime_is_numeric=datetime_is_numeric)
+        summary = values.describe(percentiles=percentiles)
         merged = merged.loc[notper, :]
         rows = [name for name in summary.index if name not in notper]
         summary = summary.loc[rows, :]
diff --git a/pandas_streaming/df/dataframe_helpers.py b/pandas_streaming/df/dataframe_helpers.py
index ebfdeb6..3dc8f3a 100644
--- a/pandas_streaming/df/dataframe_helpers.py
+++ b/pandas_streaming/df/dataframe_helpers.py
@@ -7,7 +7,7 @@
 import struct
 import warnings
 import numpy
-from pandas import DataFrame, Index
+from pandas import DataFrame, Index, Series
 
 
 def numpy_types():
@@ -389,6 +389,18 @@ def pandas_groupby_nan(df, by, axis=0, as_index=False, suffix=None, nanback=True
         gr2 = pandas_groupby_nan(df, ["ind"]).sum()
         print(gr2)
     """
+    if nanback and suffix is None:
+        try:
+            res = df.groupby(by, axis=axis, as_index=as_index,
+                             dropna=False, **kwargs)
+        except TypeError:
+            # old version of pandas
+            res = None
+        if res is not None:
+            if suffix is None:
+                return res
+            res.index = Series(res.index).replace(numpy.nan, suffix)
+            return res
     if axis != 0:
         raise NotImplementedError("axis should be 0")
     if as_index:
@@ -519,5 +531,4 @@ def pandas_groupby_nan(df, by, axis=0, as_index=False, suffix=None, nanback=True
         #     "Not implemented for type: {0}".format(type(grou.grouper)))
         # del res.grouper._cache
         return res
-    else:
-        return df.groupby(by, axis=axis, **kwargs)
+    return df.groupby(by, axis=axis, **kwargs)