diff --git a/.github/workflows/check-urls.yml b/.github/workflows/check-urls.yml
index f235903..8d576f4 100644
--- a/.github/workflows/check-urls.yml
+++ b/.github/workflows/check-urls.yml
@@ -43,5 +43,5 @@ jobs:
           timeout: 2
           retry_count# : 2
           # exclude_urls: https://hal.archives-ouvertes.fr/hal-00990252/document
-          # exclude_patterns: https://www.data.gouv.fr/fr/datasets/r/e3d83ab3-dc52-4c99-abaf-8a38050cc68c,https://dev.azure.com/
+          exclude_patterns: https://circleci.com/gh/sdpython/pandas_streaming/
           # force_pass : true
diff --git a/.local.jenkins.lin.yml b/.local.jenkins.lin.yml
index 9ab574d..cb0b755 100644
--- a/.local.jenkins.lin.yml
+++ b/.local.jenkins.lin.yml
@@ -9,7 +9,7 @@ virtualenv:
 
 install:
   - $PYINT -m pip install --upgrade pip
-  - $PYINT -m pip install --upgrade --no-cache-dir --no-deps --index http://localhost:8067/simple/ jyquickhelper pyquickhelper pandas_streaming --extra-index-url=https://pypi.python.org/simple/
+  - $PYINT -m pip install --upgrade --no-cache-dir --no-deps --index http://localhost:8067/simple/ jyquickhelper pandas_streaming --extra-index-url=https://pypi.python.org/simple/
   - $PYINT -m pip install -r requirements.txt
   - $PYINT -m pip install -r requirements-dev.txt
   - $PYINT --version
diff --git a/README.rst b/README.rst
index 385a950..1e36110 100644
--- a/README.rst
+++ b/README.rst
@@ -12,8 +12,8 @@ pandas-streaming: streaming API over pandas
     :target: https://ci.appveyor.com/project/sdpython/pandas-streaming
     :alt: Build Status Windows
 
-.. image:: https://circleci.com/gh/sdpython/pandas_streaming/tree/main.svg?style=svg
-    :target: https://circleci.com/gh/sdpython/pandas_streaming/tree/main
+.. image:: https://dl.circleci.com/status-badge/img/gh/sdpython/pandas-streaming/tree/main.svg?style=svg
+    :target: https://dl.circleci.com/status-badge/redirect/gh/sdpython/pandas-streaming/tree/main
 
 .. image:: https://dev.azure.com/xavierdupre3/pandas_streaming/_apis/build/status/sdpython.pandas_streaming
     :target: https://dev.azure.com/xavierdupre3/pandas_streaming/
diff --git a/_doc/conf.py b/_doc/conf.py
index 811119c..a125370 100644
--- a/_doc/conf.py
+++ b/_doc/conf.py
@@ -61,7 +61,7 @@
 
 # The following is used by sphinx.ext.linkcode to provide links to github
 linkcode_resolve = make_linkcode_resolve(
-    "pandas_streaming",
+    "pandas-streaming",
     (
         "https://github.com/sdpython/pandas-streaming/"
         "blob/{revision}/{package}/"
diff --git a/_doc/index.rst b/_doc/index.rst
index c731105..2ba1daa 100644
--- a/_doc/index.rst
+++ b/_doc/index.rst
@@ -16,8 +16,8 @@ pandas-streaming: streaming API over pandas
     :target: https://ci.appveyor.com/project/sdpython/pandas-streaming
     :alt: Build Status Windows
 
-.. image:: https://circleci.com/gh/sdpython/pandas_streaming/tree/main.svg?style=svg
-    :target: https://circleci.com/gh/sdpython/pandas_streaming/tree/main
+.. image:: https://dl.circleci.com/status-badge/img/gh/sdpython/pandas-streaming/tree/main.svg?style=svg
+    :target: https://dl.circleci.com/status-badge/redirect/gh/sdpython/pandas-streaming/tree/main
 
 ..
image:: https://dev.azure.com/xavierdupre3/pandas_streaming/_apis/build/status/sdpython.pandas_streaming :target: https://dev.azure.com/xavierdupre3/pandas_streaming/ diff --git a/_unittests/ut_df/test_connex_split.py b/_unittests/ut_df/test_connex_split.py index e373c9b..2ff1cfe 100644 --- a/_unittests/ut_df/test_connex_split.py +++ b/_unittests/ut_df/test_connex_split.py @@ -1,6 +1,6 @@ import unittest import pandas -from pyquickhelper.pycode import ExtTestCase +from pandas_streaming.ext_test_case import ExtTestCase from pandas_streaming.df import ( dataframe_shuffle, train_test_split_weights, diff --git a/_unittests/ut_df/test_connex_split_big.py b/_unittests/ut_df/test_connex_split_big.py index f297ec8..8378b08 100644 --- a/_unittests/ut_df/test_connex_split_big.py +++ b/_unittests/ut_df/test_connex_split_big.py @@ -3,7 +3,7 @@ import unittest from collections import Counter import pandas -from pyquickhelper.pycode import ExtTestCase +from pandas_streaming.ext_test_case import ExtTestCase from pandas_streaming.df import train_test_connex_split diff --git a/_unittests/ut_df/test_connex_split_cat.py b/_unittests/ut_df/test_connex_split_cat.py index 3eb55e8..cf72d20 100644 --- a/_unittests/ut_df/test_connex_split_cat.py +++ b/_unittests/ut_df/test_connex_split_cat.py @@ -3,7 +3,7 @@ import unittest from collections import Counter import pandas -from pyquickhelper.pycode import ExtTestCase +from pandas_streaming.ext_test_case import ExtTestCase from pandas_streaming.df import train_test_apart_stratify diff --git a/_unittests/ut_df/test_dataframe_helpers.py b/_unittests/ut_df/test_dataframe_helpers.py index edd0db6..8cdb3f2 100644 --- a/_unittests/ut_df/test_dataframe_helpers.py +++ b/_unittests/ut_df/test_dataframe_helpers.py @@ -2,7 +2,7 @@ import unittest import numpy import pandas -from pyquickhelper.pycode import ExtTestCase +from pandas_streaming.ext_test_case import ExtTestCase from pandas_streaming.df import dataframe_hash_columns diff --git a/_unittests/ut_df/test_dataframe_helpers_simple.py b/_unittests/ut_df/test_dataframe_helpers_simple.py index 5d68296..b9bfc5c 100644 --- a/_unittests/ut_df/test_dataframe_helpers_simple.py +++ b/_unittests/ut_df/test_dataframe_helpers_simple.py @@ -1,7 +1,7 @@ import unittest import pandas import numpy -from pyquickhelper.pycode import ExtTestCase +from pandas_streaming.ext_test_case import ExtTestCase from pandas_streaming.df import dataframe_unfold from pandas_streaming.df.dataframe_helpers import hash_int, hash_str, hash_float diff --git a/_unittests/ut_df/test_dataframe_io.py b/_unittests/ut_df/test_dataframe_io.py index 3e2125a..46ae322 100644 --- a/_unittests/ut_df/test_dataframe_io.py +++ b/_unittests/ut_df/test_dataframe_io.py @@ -1,10 +1,11 @@ import os +import tempfile import unittest import io import zipfile import numpy import pandas -from pyquickhelper.pycode import ExtTestCase, get_temp_folder +from pandas_streaming.ext_test_case import ExtTestCase from pandas_streaming.df import to_zip, read_zip @@ -20,43 +21,43 @@ def test_zip_dataframe(self): ] ) - temp = get_temp_folder(__file__, "temp_zip") - name = os.path.join(temp, "df.zip") - to_zip(df, name, encoding="utf-8", index=False) - df2 = read_zip(name, encoding="utf-8") - self.assertEqualDataFrame(df, df2) + with tempfile.TemporaryDirectory() as temp: + name = os.path.join(temp, "df.zip") + to_zip(df, name, encoding="utf-8", index=False) + df2 = read_zip(name, encoding="utf-8") + self.assertEqualDataFrame(df, df2) - st = io.BytesIO() - zp = zipfile.ZipFile(st, "w") - to_zip(df, 
zp, encoding="utf-8", index=False) - zp.close() + st = io.BytesIO() + zp = zipfile.ZipFile(st, "w") + to_zip(df, zp, encoding="utf-8", index=False) + zp.close() - st = io.BytesIO(st.getvalue()) - zp = zipfile.ZipFile(st, "r") - df3 = read_zip(zp, encoding="utf-8") - zp.close() - self.assertEqualDataFrame(df, df3) + st = io.BytesIO(st.getvalue()) + zp = zipfile.ZipFile(st, "r") + df3 = read_zip(zp, encoding="utf-8") + zp.close() + self.assertEqualDataFrame(df, df3) def test_zip_numpy(self): df = numpy.zeros((3, 4)) df[2, 3] = 1 - temp = get_temp_folder(__file__, "temp_zip") - name = os.path.join(temp, "df.zip") - to_zip(df, name, "arr.npy") - df2 = read_zip(name, "arr.npy") - self.assertEqualArray(df, df2) - - st = io.BytesIO() - zp = zipfile.ZipFile(st, "w") - to_zip(df, zp, "arr.npy") - zp.close() - - st = io.BytesIO(st.getvalue()) - zp = zipfile.ZipFile(st, "r") - df3 = read_zip(zp, "arr.npy") - zp.close() - self.assertEqualArray(df, df3) + with tempfile.TemporaryDirectory() as temp: + name = os.path.join(temp, "df.zip") + to_zip(df, name, "arr.npy") + df2 = read_zip(name, "arr.npy") + self.assertEqualArray(df, df2) + + st = io.BytesIO() + zp = zipfile.ZipFile(st, "w") + to_zip(df, zp, "arr.npy") + zp.close() + + st = io.BytesIO(st.getvalue()) + zp = zipfile.ZipFile(st, "r") + df3 = read_zip(zp, "arr.npy") + zp.close() + self.assertEqualArray(df, df3) if __name__ == "__main__": diff --git a/_unittests/ut_df/test_dataframe_io_helpers.py b/_unittests/ut_df/test_dataframe_io_helpers.py index 403a087..86a8bc4 100644 --- a/_unittests/ut_df/test_dataframe_io_helpers.py +++ b/_unittests/ut_df/test_dataframe_io_helpers.py @@ -2,7 +2,7 @@ from io import StringIO, BytesIO from json import loads import pandas -from pyquickhelper.pycode import ExtTestCase +from pandas_streaming.ext_test_case import ExtTestCase from pandas_streaming.df.dataframe_io_helpers import ( enumerate_json_items, JsonPerRowsStream, diff --git a/_unittests/ut_df/test_dataframe_sort.py b/_unittests/ut_df/test_dataframe_sort.py index 354e4d5..6083850 100644 --- a/_unittests/ut_df/test_dataframe_sort.py +++ b/_unittests/ut_df/test_dataframe_sort.py @@ -1,104 +1,105 @@ import os +import tempfile import unittest import pandas -from pyquickhelper.pycode import ExtTestCase, get_temp_folder +from pandas_streaming.ext_test_case import ExtTestCase from pandas_streaming.df import StreamingDataFrame class TestDataFrameSort(ExtTestCase): def test_sort_values(self): - temp = get_temp_folder(__file__, "temp_sort_values") - name = os.path.join(temp, "_data_") - df = pandas.DataFrame( - [ - dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(a=5, b="f", c=5.7, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5), - ] - ) - sdf = StreamingDataFrame.read_df(df, chunksize=2) - sorted_df = df.sort_values(by="a") - res = sdf.sort_values(by="a", temp_file=name) - res_df = res.to_df() - self.assertEqualDataFrame(sorted_df, res_df) + with tempfile.TemporaryDirectory() as temp: + name = os.path.join(temp, "_data_") + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(a=5, b="f", c=5.7, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) + sdf = StreamingDataFrame.read_df(df, chunksize=2) + sorted_df = df.sort_values(by="a") + res = sdf.sort_values(by="a", temp_file=name) + res_df = res.to_df() + self.assertEqualDataFrame(sorted_df, res_df) def 
test_sort_values_twice(self): - temp = get_temp_folder(__file__, "temp_sort_values_twice") - name = os.path.join(temp, "_data_") - df = pandas.DataFrame( - [ - dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(a=5, b="f", c=5.7, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5), - ] - ) - sdf = StreamingDataFrame.read_df(df, chunksize=2) - sorted_df = df.sort_values(by="a") - res = sdf.sort_values(by="a", temp_file=name) - res_df = res.to_df() - self.assertEqualDataFrame(sorted_df, res_df) - res_df = res.to_df() - self.assertEqualDataFrame(sorted_df, res_df) + with tempfile.TemporaryDirectory() as temp: + name = os.path.join(temp, "_data_") + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(a=5, b="f", c=5.7, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) + sdf = StreamingDataFrame.read_df(df, chunksize=2) + sorted_df = df.sort_values(by="a") + res = sdf.sort_values(by="a", temp_file=name) + res_df = res.to_df() + self.assertEqualDataFrame(sorted_df, res_df) + res_df = res.to_df() + self.assertEqualDataFrame(sorted_df, res_df) def test_sort_values_reverse(self): - temp = get_temp_folder(__file__, "temp_sort_values_reverse") - name = os.path.join(temp, "_data_") - df = pandas.DataFrame( - [ - dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(a=5, b="f", c=5.7, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5), - ] - ) - sdf = StreamingDataFrame.read_df(df, chunksize=2) - sorted_df = df.sort_values(by="a", ascending=False) - res = sdf.sort_values(by="a", temp_file=name, ascending=False) - res_df = res.to_df() - self.assertEqualDataFrame(sorted_df, res_df) + with tempfile.TemporaryDirectory() as temp: + name = os.path.join(temp, "_data_") + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(a=5, b="f", c=5.7, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) + sdf = StreamingDataFrame.read_df(df, chunksize=2) + sorted_df = df.sort_values(by="a", ascending=False) + res = sdf.sort_values(by="a", temp_file=name, ascending=False) + res_df = res.to_df() + self.assertEqualDataFrame(sorted_df, res_df) def test_sort_values_nan_last(self): - temp = get_temp_folder(__file__, "temp_sort_values_nan_last") - name = os.path.join(temp, "_data_") - df = pandas.DataFrame( - [ - dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(b="f", c=5.7, ind="a2", ai=2), - dict(b="f", c=5.8, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5), - ] - ) - sdf = StreamingDataFrame.read_df(df, chunksize=2) - sorted_df = df.sort_values(by="a", na_position="last") - res = sdf.sort_values(by="a", temp_file=name, na_position="last") - res_df = res.to_df() - self.assertEqualDataFrame(sorted_df, res_df) + with tempfile.TemporaryDirectory() as temp: + name = os.path.join(temp, "_data_") + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(b="f", c=5.7, ind="a2", ai=2), + dict(b="f", c=5.8, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) + sdf = StreamingDataFrame.read_df(df, chunksize=2) + sorted_df = df.sort_values(by="a", na_position="last") + res = 
sdf.sort_values(by="a", temp_file=name, na_position="last") + res_df = res.to_df() + self.assertEqualDataFrame(sorted_df, res_df) def test_sort_values_nan_first(self): - temp = get_temp_folder(__file__, "temp_sort_values_nan_first") - name = os.path.join(temp, "_data_") - df = pandas.DataFrame( - [ - dict(a=1, b="eé", c=5.6, ind="a1", ai=1), - dict(b="f", c=5.7, ind="a2", ai=2), - dict(b="f", c=5.8, ind="a2", ai=2), - dict(a=4, b="g", ind="a3", ai=3), - dict(a=8, b="h", c=5.9, ai=4), - dict(a=16, b="i", c=6.2, ind="a5", ai=5), - ] - ) - sdf = StreamingDataFrame.read_df(df, chunksize=2) - sorted_df = df.sort_values(by="a", na_position="first") - res = sdf.sort_values(by="a", temp_file=name, na_position="first") - res_df = res.to_df() - self.assertEqualDataFrame(sorted_df, res_df) + with tempfile.TemporaryDirectory() as temp: + name = os.path.join(temp, "_data_") + df = pandas.DataFrame( + [ + dict(a=1, b="eé", c=5.6, ind="a1", ai=1), + dict(b="f", c=5.7, ind="a2", ai=2), + dict(b="f", c=5.8, ind="a2", ai=2), + dict(a=4, b="g", ind="a3", ai=3), + dict(a=8, b="h", c=5.9, ai=4), + dict(a=16, b="i", c=6.2, ind="a5", ai=5), + ] + ) + sdf = StreamingDataFrame.read_df(df, chunksize=2) + sorted_df = df.sort_values(by="a", na_position="first") + res = sdf.sort_values(by="a", temp_file=name, na_position="first") + res_df = res.to_df() + self.assertEqualDataFrame(sorted_df, res_df) if __name__ == "__main__": diff --git a/_unittests/ut_df/test_pandas_groupbynan.py b/_unittests/ut_df/test_pandas_groupbynan.py index 3d9a635..5379a06 100644 --- a/_unittests/ut_df/test_pandas_groupbynan.py +++ b/_unittests/ut_df/test_pandas_groupbynan.py @@ -2,7 +2,7 @@ import pandas import numpy from scipy.sparse.linalg import lsqr as sparse_lsqr -from pyquickhelper.pycode import ExtTestCase, ignore_warnings +from pandas_streaming.ext_test_case import ExtTestCase, ignore_warnings from pandas_streaming.df import pandas_groupby_nan, numpy_types diff --git a/_unittests/ut_df/test_streaming_dataframe.py b/_unittests/ut_df/test_streaming_dataframe.py index b62f9a5..9d4e7d6 100644 --- a/_unittests/ut_df/test_streaming_dataframe.py +++ b/_unittests/ut_df/test_streaming_dataframe.py @@ -1,9 +1,10 @@ import os +import tempfile import unittest from io import StringIO import pandas import numpy -from pyquickhelper.pycode import ExtTestCase, get_temp_folder, ignore_warnings +from pandas_streaming.ext_test_case import ExtTestCase, ignore_warnings from pandas_streaming.data import dummy_streaming_dataframe from pandas_streaming.df import StreamingDataFrame from pandas_streaming.df.dataframe import StreamingDataFrameSchemaError @@ -56,39 +57,39 @@ def test_tail(self): self.assertEqual(st.shape, (10, 2)) def test_read_csv(self): - temp = get_temp_folder(__file__, "temp_read_csv") - df = pandas.DataFrame(data=dict(a=[5, 6], b=["er", "r"])) - name = os.path.join(temp, "df.csv") - name2 = os.path.join(temp, "df2.csv") - name3 = os.path.join(temp, "df3.csv") - df.to_csv(name, index=False) - df.to_csv(name2, index=True) - sdf = StreamingDataFrame.read_csv(name) - text = sdf.to_csv(index=False) - self.assertRaise( - lambda: StreamingDataFrame.read_csv(name2, index_col=0, chunksize=None), - ValueError, - ) - self.assertRaise( - lambda: StreamingDataFrame.read_csv(name2, index_col=0, iterator=False), - ValueError, - ) - sdf2 = StreamingDataFrame.read_csv(name2, index_col=0) - text2 = sdf2.to_csv(index=True) - sdf2.to_csv(name3, index=True) - with open(name, "r", encoding="utf-8") as f: - exp = f.read() - with open(name2, "r", encoding="utf-8") 
as f: - exp2 = f.read() - with open(name3, "r", encoding="utf-8") as f: - text3 = f.read() - self.assertEqual(text.replace("\r", ""), exp) - sdf2 = StreamingDataFrame.read_df(df) - self.assertEqualDataFrame(sdf.to_dataframe(), sdf2.to_dataframe()) - self.assertEqual(text2.replace("\r", ""), exp2) - self.assertEqual( - text3.replace("\r", "").replace("\n\n", "\n"), exp2.replace("\r", "") - ) + with tempfile.TemporaryDirectory() as temp: + df = pandas.DataFrame(data=dict(a=[5, 6], b=["er", "r"])) + name = os.path.join(temp, "df.csv") + name2 = os.path.join(temp, "df2.csv") + name3 = os.path.join(temp, "df3.csv") + df.to_csv(name, index=False) + df.to_csv(name2, index=True) + sdf = StreamingDataFrame.read_csv(name) + text = sdf.to_csv(index=False) + self.assertRaise( + lambda: StreamingDataFrame.read_csv(name2, index_col=0, chunksize=None), + ValueError, + ) + self.assertRaise( + lambda: StreamingDataFrame.read_csv(name2, index_col=0, iterator=False), + ValueError, + ) + sdf2 = StreamingDataFrame.read_csv(name2, index_col=0) + text2 = sdf2.to_csv(index=True) + sdf2.to_csv(name3, index=True) + with open(name, "r", encoding="utf-8") as f: + exp = f.read() + with open(name2, "r", encoding="utf-8") as f: + exp2 = f.read() + with open(name3, "r", encoding="utf-8") as f: + text3 = f.read() + self.assertEqual(text.replace("\r", ""), exp) + sdf2 = StreamingDataFrame.read_df(df) + self.assertEqualDataFrame(sdf.to_dataframe(), sdf2.to_dataframe()) + self.assertEqual(text2.replace("\r", ""), exp2) + self.assertEqual( + text3.replace("\r", "").replace("\n\n", "\n"), exp2.replace("\r", "") + ) def test_where(self): sdf = dummy_streaming_dataframe(100) @@ -248,43 +249,43 @@ def test_train_test_split_streaming_strat(self): self.assertGreater(gr["cfloat"].min(), 4) def test_train_test_split_file(self): - temp = get_temp_folder(__file__, "temp_train_test_split_file") - names = [os.path.join(temp, "train.txt"), os.path.join(temp, "test.txt")] - sdf = dummy_streaming_dataframe(100) - sdf.train_test_split(names, index=False, streaming=False) - trsdf = StreamingDataFrame.read_csv(names[0]) - tesdf = StreamingDataFrame.read_csv(names[1]) - self.assertGreater(trsdf.shape[0], 20) - self.assertGreater(tesdf.shape[0], 20) - trdf = trsdf.to_dataframe() - tedf = tesdf.to_dataframe() - self.assertGreater(trdf.shape[0], 20) - self.assertGreater(tedf.shape[0], 20) - df_exp = sdf.to_dataframe() - df_val = pandas.concat([trdf, tedf]) - self.assertEqual(df_exp.shape, df_val.shape) - df_val = df_val.sort_values("cint").reset_index(drop=True) - self.assertEqualDataFrame(df_val, df_exp) + with tempfile.TemporaryDirectory() as temp: + names = [os.path.join(temp, "train.txt"), os.path.join(temp, "test.txt")] + sdf = dummy_streaming_dataframe(100) + sdf.train_test_split(names, index=False, streaming=False) + trsdf = StreamingDataFrame.read_csv(names[0]) + tesdf = StreamingDataFrame.read_csv(names[1]) + self.assertGreater(trsdf.shape[0], 20) + self.assertGreater(tesdf.shape[0], 20) + trdf = trsdf.to_dataframe() + tedf = tesdf.to_dataframe() + self.assertGreater(trdf.shape[0], 20) + self.assertGreater(tedf.shape[0], 20) + df_exp = sdf.to_dataframe() + df_val = pandas.concat([trdf, tedf]) + self.assertEqual(df_exp.shape, df_val.shape) + df_val = df_val.sort_values("cint").reset_index(drop=True) + self.assertEqualDataFrame(df_val, df_exp) def test_train_test_split_file_pattern(self): - temp = get_temp_folder(__file__, "temp_train_test_split_file_pattern") - sdf = dummy_streaming_dataframe(100) - names = os.path.join(temp, 
"spl_{0}.txt") - self.assertRaise( - lambda: sdf.train_test_split(names, index=False, streaming=False), - ValueError, - ) - names = os.path.join(temp, "spl_{}.txt") - tr, te = sdf.train_test_split(names, index=False, streaming=False) - trsdf = StreamingDataFrame.read_csv(tr) - tesdf = StreamingDataFrame.read_csv(te) - trdf = trsdf.to_dataframe() - tedf = tesdf.to_dataframe() - df_exp = sdf.to_dataframe() - df_val = pandas.concat([trdf, tedf]) - self.assertEqual(df_exp.shape, df_val.shape) - df_val = df_val.sort_values("cint").reset_index(drop=True) - self.assertEqualDataFrame(df_val, df_exp) + with tempfile.TemporaryDirectory() as temp: + sdf = dummy_streaming_dataframe(100) + names = os.path.join(temp, "spl_{0}.txt") + self.assertRaise( + lambda: sdf.train_test_split(names, index=False, streaming=False), + ValueError, + ) + names = os.path.join(temp, "spl_{}.txt") + tr, te = sdf.train_test_split(names, index=False, streaming=False) + trsdf = StreamingDataFrame.read_csv(tr) + tesdf = StreamingDataFrame.read_csv(te) + trdf = trsdf.to_dataframe() + tedf = tesdf.to_dataframe() + df_exp = sdf.to_dataframe() + df_val = pandas.concat([trdf, tedf]) + self.assertEqual(df_exp.shape, df_val.shape) + df_val = df_val.sort_values("cint").reset_index(drop=True) + self.assertEqualDataFrame(df_val, df_exp) def test_merge(self): def compares(a, b, how): @@ -328,18 +329,18 @@ def test_concatv(self): m1 = sdf20.concat(map(lambda x: x, [df30]), axis=0) self.assertEqualDataFrame(m1.to_dataframe(), df) - df30["g"] = 4 - self.assertRaise( - lambda: sdf20.concat(df30).to_dataframe(), - ValueError, - "Frame others[0] do not have the same column names", - ) df20["cint"] = df20["cint"].astype(float) self.assertRaise( lambda: sdf20.concat(df20).to_dataframe(), ValueError, "Frame others[0] do not have the same column types", ) + df30["g"] = 4 + self.assertRaise( + lambda: sdf20.concat(df30).to_dataframe(), + ValueError, + "Frame others[0] do not have the same column names", + ) def test_concath(self): sdf20 = dummy_streaming_dataframe(20) @@ -453,18 +454,18 @@ def test_schema_consistent(self): dict(cf=3, cint=3, cstr="3"), ] ) - temp = get_temp_folder(__file__, "temp_schema_consistant") - name = os.path.join(temp, "df.csv") - stio = StringIO() - df.to_csv(stio, index=False) - self.assertNotEmpty(stio.getvalue()) - df.to_csv(name, index=False) - self.assertEqual(df.shape, (4, 3)) - sdf = StreamingDataFrame.read_csv(name, chunksize=2) - self.assertRaise(lambda: list(sdf), StreamingDataFrameSchemaError) - sdf = StreamingDataFrame.read_csv(name, chunksize=2, check_schema=False) - pieces = list(sdf) - self.assertEqual(len(pieces), 2) + with tempfile.TemporaryDirectory() as temp: + name = os.path.join(temp, "df.csv") + stio = StringIO() + df.to_csv(stio, index=False) + self.assertNotEmpty(stio.getvalue()) + df.to_csv(name, index=False) + self.assertEqual(df.shape, (4, 3)) + sdf = StreamingDataFrame.read_csv(name, chunksize=2) + self.assertRaise(lambda: list(sdf), StreamingDataFrameSchemaError) + sdf = StreamingDataFrame.read_csv(name, chunksize=2, check_schema=False) + pieces = list(sdf) + self.assertEqual(len(pieces), 2) def test_getitem(self): sdf = dummy_streaming_dataframe(100) @@ -513,12 +514,12 @@ def test_fillna(self): df2 = pandas.DataFrame(data=dict(X=[4.5, 10.0, 7], Y=["a", "b", "NAN"])) na = sdf.fillna(value=dict(X=10.0, Y="NAN")) ndf = na.to_df() - self.assertEqual(ndf, df2) + self.assertEqualDataFrame(ndf, df2) df3 = pandas.DataFrame(data=dict(X=[4.5, 10.0, 7], Y=["a", "b", numpy.nan])) na = 
sdf.fillna(value=dict(X=10.0))
         ndf = na.to_df()
-        self.assertEqual(ndf, df3)
+        self.assertEqualDataFrame(ndf, df3)
 
     def test_describe(self):
         x = numpy.arange(100001).astype(numpy.float64) / 100000 - 0.5
@@ -531,12 +532,14 @@ def test_describe(self):
         self.assertEqual(["X", "Y"], list(desc.columns))
         self.assertEqual(desc.loc["min", :].tolist(), [-0.5, 0])
         self.assertEqual(desc.loc["max", :].tolist(), [0.5, 100000])
-        self.assertEqualArray(desc.loc["mean", :], numpy.array([0, 50000]), atol=1e-8)
+        self.assertEqualArray(
+            desc.loc["mean", :], numpy.array([0, 50000], dtype=numpy.float64), atol=1e-8
+        )
         self.assertEqualArray(desc.loc["25%", :], numpy.array([-0.25, 25000]))
         self.assertEqualArray(desc.loc["50%", :], numpy.array([0.0, 50000]))
         self.assertEqualArray(desc.loc["75%", :], numpy.array([0.25, 75000]))
         self.assertEqualArray(
-            desc.loc["std", :], numpy.array([2.886795e-01, 28867.946472]), decimal=4
+            desc.loc["std", :], numpy.array([2.886795e-01, 28867.946472]), atol=1e-4
         )
 
     def test_set_item(self):
diff --git a/_unittests/ut_module/test_sklearn.py b/_unittests/ut_module/test_sklearn.py
index c8bdbfc..92d3888 100644
--- a/_unittests/ut_module/test_sklearn.py
+++ b/_unittests/ut_module/test_sklearn.py
@@ -2,7 +2,7 @@
 import numpy
 import pandas
 from sklearn.linear_model import LogisticRegression
-from pyquickhelper.pycode import ExtTestCase
+from pandas_streaming.ext_test_case import ExtTestCase
 
 
 class TestScikitLearn(ExtTestCase):
@@ -14,7 +14,7 @@ def test_logistic_regression_check(self):
         )
         clq.fit(X, Y)
         pred2 = clq.predict(X)
-        self.assertEqual(numpy.array([0, 1]), pred2)
+        self.assertEqualArray(numpy.array([0, 1]), pred2)
 
 
 if __name__ == "__main__":
diff --git a/pandas_streaming/ext_test_case.py b/pandas_streaming/ext_test_case.py
new file mode 100644
index 0000000..dfd073d
--- /dev/null
+++ b/pandas_streaming/ext_test_case.py
@@ -0,0 +1,191 @@
+import os
+import sys
+import unittest
+import warnings
+from contextlib import redirect_stderr, redirect_stdout
+from io import StringIO
+from typing import Any, Callable, List, Optional
+
+import numpy
+from numpy.testing import assert_allclose
+
+
+def unit_test_going():
+    """
+    Enables a flag telling a script that it is running inside a unit test.
+    Avoids unit tests being very long.
+    """
+    going = int(os.environ.get("UNITTEST_GOING", 0))
+    return going == 1
+
+
+def ignore_warnings(warns: List[Warning]) -> Callable:
+    """
+    Catches warnings.
+
+    :param warns: warnings to ignore
+    """
+
+    def wrapper(fct):
+        if warns is None:
+            raise AssertionError(f"warns cannot be None for '{fct}'.")
+
+        def call_f(self):
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore", warns)
+                return fct(self)
+
+        return call_f
+
+    return wrapper
+
+
+class sys_path_append:
+    """
+    Stores the content of :epkg:`*py:sys:path` and
+    restores it afterwards.
+    """
+
+    def __init__(self, paths, position=-1):
+        """
+        :param paths: paths to add
+        :param position: where to add it
+        """
+        self.to_add = paths if isinstance(paths, list) else [paths]
+        self.position = position
+
+    def __enter__(self):
+        """
+        Modifies ``sys.path``.
+        """
+        self.store = sys.path.copy()
+        if self.position == -1:
+            sys.path.extend(self.to_add)
+        else:
+            for p in reversed(self.to_add):
+                sys.path.insert(self.position, p)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        """
+        Restores ``sys.path``.
+        """
+        sys.path = self.store
+
+
+class ExtTestCase(unittest.TestCase):
+    _warns = []
+
+    def assertExists(self, name):
+        if not os.path.exists(name):
+            raise AssertionError(f"File or folder {name!r} does not exist.")
+
+    def assertEqualArray(
+        self,
+        expected: numpy.ndarray,
+        value: numpy.ndarray,
+        atol: float = 0,
+        rtol: float = 0,
+    ):
+        self.assertEqual(expected.dtype, value.dtype)
+        self.assertEqual(expected.shape, value.shape)
+        assert_allclose(expected, value, atol=atol, rtol=rtol)
+
+    def assertEqualDataFrame(self, d1, d2, **kwargs):
+        """
+        Checks that two dataframes are equal.
+        Calls :func:`pandas.testing.assert_frame_equal`.
+        """
+        from pandas.testing import assert_frame_equal
+
+        assert_frame_equal(d1, d2, **kwargs)
+
+    def assertAlmostEqual(
+        self,
+        expected: numpy.ndarray,
+        value: numpy.ndarray,
+        atol: float = 0,
+        rtol: float = 0,
+    ):
+        if not isinstance(expected, numpy.ndarray):
+            expected = numpy.array(expected)
+        if not isinstance(value, numpy.ndarray):
+            value = numpy.array(value).astype(expected.dtype)
+        self.assertEqualArray(expected, value, atol=atol, rtol=rtol)
+
+    def assertRaise(
+        self, fct: Callable, exc_type: Exception, msg: Optional[str] = None
+    ):
+        try:
+            fct()
+        except exc_type as e:
+            if not isinstance(e, exc_type):
+                raise AssertionError(f"Unexpected exception {type(e)!r}.")
+            if msg is None:
+                return
+            if msg not in str(e):
+                raise AssertionError(f"Unexpected error message {e!r}.")
+            return
+        raise AssertionError("No exception was raised.")
+
+    def assertEmpty(self, value: Any):
+        if value is None:
+            return
+        if len(value) == 0:
+            return
+        raise AssertionError(f"value is not empty: {value!r}.")
+
+    def assertNotEmpty(self, value: Any):
+        if value is None:
+            raise AssertionError(f"value is empty: {value!r}.")
+        if isinstance(value, (list, dict, tuple, set)):
+            if len(value) == 0:
+                raise AssertionError(f"value is empty: {value!r}.")
+
+    def assertStartsWith(self, prefix: str, full: str):
+        if not full.startswith(prefix):
+            raise AssertionError(f"prefix={prefix!r} does not start string {full!r}.")
+
+    def assertLesser(self, x, y, strict=False):
+        """
+        Checks that ``x <= y``.
+        """
+        if x > y or (strict and x == y):
+            raise AssertionError(
+                "x >{2} y with x={0} and y={1}".format(
+                    x,
+                    y,
+                    "" if strict else "=",
+                )
+            )
+
+    @staticmethod
+    def abs_path_join(filename: str, *args: List[str]):
+        """
+        Returns an absolute and normalized path from this location.
+
+        :param filename: filename, the folder which contains it
+            is used as the base
+        :param args: list of subpaths to the previous path
+        :return: absolute and normalized path
+        """
+        dirname = os.path.join(os.path.dirname(filename), *args)
+        return os.path.normpath(os.path.abspath(dirname))
+
+    @classmethod
+    def tearDownClass(cls):
+        for name, line, w in cls._warns:
+            warnings.warn(f"\n{name}:{line}: {type(w)}\n {str(w)}")
+
+    def capture(self, fct: Callable):
+        """
+        Runs a function and captures standard output and error.
+
+        :param fct: function to run
+        :return: result of *fct*, output, error
+        """
+        sout = StringIO()
+        serr = StringIO()
+        with redirect_stdout(sout):
+            with redirect_stderr(serr):
+                res = fct()
+        return res, sout.getvalue(), serr.getvalue()
diff --git a/requirements-dev.txt b/requirements-dev.txt
index a10fbfd..679ba5a 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -14,7 +14,6 @@ pycodestyle
 pylint>=2.14.0
 pytest
 pytest-cov
-pyquickhelper>=1.10
 rstcheck[sphinx,toml]
 ruff
 scikit-learn