-
-
Notifications
You must be signed in to change notification settings - Fork 26
Support cudf as a DataFrame backend #212
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9875b27
3afa3b9
bead5cb
4381738
3d229a4
c947cfa
f3c466c
55d4a9b
489f6ef
06f3bf8
7d7a400
934bd03
65252c4
b2ebdf9
d907084
8573040
5c4c019
07934a2
6d308d7
b9624da
1f203f9
f502717
fc88402
5d927c1
c766ed8
7f87c8f
b8b3896
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,20 +1,25 @@ | ||
| import importlib | ||
| import os | ||
|
|
||
| import dask.dataframe as dd | ||
| import pandas as pd | ||
| import pytest | ||
| from dask import config | ||
| from dask.dataframe.utils import assert_eq | ||
|
|
||
| from dask_expr import from_dask_dataframe, from_pandas, optimize, read_csv, read_parquet | ||
| from dask_expr._expr import Expr, Lengths, Literal, Replace | ||
| from dask_expr._reductions import Len | ||
| from dask_expr.io import ReadParquet | ||
|
|
||
| # Import backend DataFrame library to test | ||
| BACKEND = config.get("dataframe.backend", "pandas") | ||
| lib = importlib.import_module(BACKEND) | ||
|
Comment on lines
+14
to
+16
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @phofl - Any opinions on using this approach as a "switch" to test the
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Certainly interested in your thoughts as well @mrocklin
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is ok, although it will take me some time to get used to it😂 traveling today, will take a closer look at the whole pr when I am back |
||
|
|
||
|
|
||
| def _make_file(dir, format="parquet", df=None): | ||
| fn = os.path.join(str(dir), f"myfile.{format}") | ||
| if df is None: | ||
| df = pd.DataFrame({c: range(10) for c in "abcde"}) | ||
| df = lib.DataFrame({c: range(10) for c in "abcde"}) | ||
| if format == "csv": | ||
| df.to_csv(fn) | ||
| elif format == "parquet": | ||
|
|
@@ -83,7 +88,7 @@ def test_io_fusion(tmpdir, fmt): | |
|
|
||
|
|
||
| def test_predicate_pushdown(tmpdir): | ||
| original = pd.DataFrame( | ||
| original = lib.DataFrame( | ||
| { | ||
| "a": [1, 2, 3, 4, 5] * 10, | ||
| "b": [0, 1, 2, 3, 4] * 10, | ||
|
|
@@ -106,11 +111,11 @@ def test_predicate_pushdown(tmpdir): | |
| y_result = y.compute() | ||
| assert y_result.name == "b" | ||
| assert len(y_result) == 6 | ||
| assert all(y_result == 4) | ||
| assert (y_result == 4).all() | ||
|
|
||
|
|
||
| def test_predicate_pushdown_compound(tmpdir): | ||
| pdf = pd.DataFrame( | ||
| pdf = lib.DataFrame( | ||
| { | ||
| "a": [1, 2, 3, 4, 5] * 10, | ||
| "b": [0, 1, 2, 3, 4] * 10, | ||
|
|
@@ -134,15 +139,18 @@ def test_predicate_pushdown_compound(tmpdir): | |
| ) | ||
|
|
||
| # Test OR | ||
| x = df[(df.a == 5) | (df.c > 20)][df.b != 0]["b"] | ||
| x = df[(df.a == 5) | (df.c > 20)] | ||
| x = x[x.b != 0]["b"] | ||
| y = optimize(x, fuse=False) | ||
| assert isinstance(y.expr, ReadParquet) | ||
| filters = [set(y.filters[0]), set(y.filters[1])] | ||
| assert {("c", ">", 20), ("b", "!=", 0)} in filters | ||
| assert {("a", "==", 5), ("b", "!=", 0)} in filters | ||
| expect = pdf[(pdf.a == 5) | (pdf.c > 20)] | ||
| expect = expect[expect.b != 0]["b"] | ||
| assert_eq( | ||
| y, | ||
| pdf[(pdf.a == 5) | (pdf.c > 20)][pdf.b != 0]["b"], | ||
| expect, | ||
| check_index=False, | ||
| ) | ||
|
|
||
|
|
@@ -158,7 +166,7 @@ def test_predicate_pushdown_compound(tmpdir): | |
|
|
||
| @pytest.mark.parametrize("fmt", ["parquet", "csv", "pandas"]) | ||
| def test_io_culling(tmpdir, fmt): | ||
| pdf = pd.DataFrame({c: range(10) for c in "abcde"}) | ||
| pdf = lib.DataFrame({c: range(10) for c in "abcde"}) | ||
| if fmt == "parquet": | ||
| dd.from_pandas(pdf, 2).to_parquet(tmpdir) | ||
| df = read_parquet(tmpdir) | ||
|
|
@@ -191,23 +199,24 @@ def _check_culling(expr, partitions): | |
|
|
||
| @pytest.mark.parametrize("sort", [True, False]) | ||
| def test_from_pandas(sort): | ||
| pdf = pd.DataFrame({"x": [1, 4, 3, 2, 0, 5]}) | ||
| pdf = lib.DataFrame({"x": [1, 4, 3, 2, 0, 5]}) | ||
| df = from_pandas(pdf, npartitions=2, sort=sort) | ||
|
|
||
| assert df.divisions == (0, 3, 5) if sort else (None,) * 3 | ||
| assert_eq(df, pdf) | ||
|
|
||
|
|
||
| def test_from_pandas_immutable(): | ||
| pdf = pd.DataFrame({"x": [1, 2, 3, 4]}) | ||
| pdf = lib.DataFrame({"x": [1, 2, 3, 4]}) | ||
| expected = pdf.copy() | ||
| df = from_pandas(pdf) | ||
| pdf["z"] = 100 | ||
| assert_eq(df, expected) | ||
|
|
||
|
|
||
| def test_parquet_complex_filters(tmpdir): | ||
| df = read_parquet(_make_file(tmpdir)) | ||
| with config.set({"dataframe.backend": BACKEND}): | ||
| df = read_parquet(_make_file(tmpdir)) | ||
| pdf = df.compute() | ||
| got = df["a"][df["b"] > df["b"].mean()] | ||
| expect = pdf["a"][pdf["b"] > pdf["b"].mean()] | ||
|
|
@@ -247,7 +256,7 @@ def test_from_dask_dataframe(optimize): | |
|
|
||
| @pytest.mark.parametrize("optimize", [True, False]) | ||
| def test_to_dask_dataframe(optimize): | ||
| pdf = pd.DataFrame({"x": [1, 4, 3, 2, 0, 5]}) | ||
| pdf = lib.DataFrame({"x": [1, 4, 3, 2, 0, 5]}) | ||
| df = from_pandas(pdf, npartitions=2) | ||
| ddf = df.to_dask_dataframe(optimize=optimize) | ||
| assert isinstance(ddf, dd.DataFrame) | ||
|
|
@@ -256,7 +265,7 @@ def test_to_dask_dataframe(optimize): | |
|
|
||
| @pytest.mark.parametrize("write_metadata_file", [True, False]) | ||
| def test_to_parquet(tmpdir, write_metadata_file): | ||
| pdf = pd.DataFrame({"x": [1, 4, 3, 2, 0, 5]}) | ||
| pdf = lib.DataFrame({"x": [1, 4, 3, 2, 0, 5]}) | ||
| df = from_pandas(pdf, npartitions=2) | ||
|
|
||
| # Check basic parquet round trip | ||
|
|
@@ -277,7 +286,7 @@ def test_to_parquet(tmpdir, write_metadata_file): | |
|
|
||
|
|
||
| def test_combine_similar(tmpdir): | ||
| pdf = pd.DataFrame( | ||
| pdf = lib.DataFrame( | ||
| {"x": [0, 1, 2, 3] * 4, "y": range(16), "z": [None, 1, 2, 3] * 4} | ||
| ) | ||
| fn = _make_file(tmpdir, format="parquet", df=pdf) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@phofl - I ran into quite a few cases where the cudf backend was hanging (rather than quickly failing) for cases where a
cudf.DataFrame/Seriesdid not support the same attribute aspd.DataFrame/Series(e.g.nbytes,align, etc).What do you think about doing something like this so that we can more-quickly detect that the dependency is missing a necessary attribute?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh yes I like this.