Skip to content
Closed
212 changes: 72 additions & 140 deletions dev/sparktestsupport/modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,72 @@
# limitations under the License.
#

from collections.abc import Iterable
from functools import total_ordering
import itertools
import os
import re
import unittest
import sys

from sparktestsupport import SPARK_HOME

all_modules = []


def _get_module_from_name(name):
__import__(name)
return sys.modules[name]


def _discover_python_unittests(*paths, discover_slow=False):
"""Discover the python module which contains unittests under paths.

For example:
given 'pyspark/tests', it returns the names of the test modules found under pyspark/tests, like
['pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', ...]

Parameters
----------
paths : str
Paths of modules to be discovered.
discover_slow : bool
If True, will only discover slow tests
If False, will discover all tests except slow tests

Returns
-------
A sorted list of the complete test module names discovered under the specified paths
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to add a simple doctest at https://github.com/apache/spark/blob/master/dev/run-tests.py? The doctests there are run before running the script.

Copy link
Copy Markdown
Member Author

@Yikun Yikun Jun 28, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, it looks difficult to add this doctest: the doctest requires a real import of pyspark, so it would fail when running the test cases in run-tests.py.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

kk that's fine

"""

def add_test_module(testcases, modules, slow):
    """Add the module names of ``testcases`` to the ``modules`` set.

    Parameters
    ----------
    testcases : iterable or test case
        Either an iterable (possibly nested) of test cases, or a single
        test case object.
    modules : set
        Set that is updated in place with the selected module names.
    slow : bool
        If True, only modules that define ``is_slow_test`` are added.
        If False, only modules that do not define it are added.
    """
    if isinstance(testcases, Iterable):
        # Suites can nest, so recurse until single test cases are reached.
        for test_case in testcases:
            add_test_module(test_case, modules, slow)
        return

    name = testcases.__module__
    if name == "unittest.loader":
        # NOTE(review): when a module fails to import (or raises SkipTest on
        # import), unittest hands back a synthetic placeholder test whose class
        # is defined in ``unittest.loader``. That name is not a real test goal,
        # so it must not be added. This relies on where CPython defines those
        # placeholder classes -- confirm if another implementation is used.
        return

    module = _get_module_from_name(name)
    # Keep the module only when its "slow" marker matches what was asked for.
    if bool(slow) == hasattr(module, 'is_slow_test'):
        modules.add(name)

if not paths:
return []
modules = set()
pyspark_path = os.path.join(SPARK_HOME, "python")
for path in paths:
# Discover the unittest in every path
testcases = unittest.defaultTestLoader.discover(
os.path.join(pyspark_path, path),
top_level_dir=pyspark_path
)
add_test_module(testcases, modules, discover_slow)

return sorted(list(modules))


@total_ordering
class Module(object):
"""
Expand Down Expand Up @@ -388,24 +446,7 @@ def __hash__(self):
"pyspark.profiler",
"pyspark.shuffle",
"pyspark.util",
# unittests
"pyspark.tests.test_appsubmit",
"pyspark.tests.test_broadcast",
"pyspark.tests.test_conf",
"pyspark.tests.test_context",
"pyspark.tests.test_daemon",
"pyspark.tests.test_install_spark",
"pyspark.tests.test_join",
"pyspark.tests.test_profiler",
"pyspark.tests.test_rdd",
"pyspark.tests.test_rddbarrier",
"pyspark.tests.test_readwrite",
"pyspark.tests.test_serializers",
"pyspark.tests.test_shuffle",
"pyspark.tests.test_taskcontext",
"pyspark.tests.test_util",
"pyspark.tests.test_worker",
]
] + _discover_python_unittests("pyspark/tests"),
)

pyspark_sql = Module(
Expand Down Expand Up @@ -437,32 +478,7 @@ def __hash__(self):
"pyspark.sql.pandas.serializers",
"pyspark.sql.pandas.typehints",
"pyspark.sql.pandas.utils",
# unittests
"pyspark.sql.tests.test_arrow",
"pyspark.sql.tests.test_catalog",
"pyspark.sql.tests.test_column",
"pyspark.sql.tests.test_conf",
"pyspark.sql.tests.test_context",
"pyspark.sql.tests.test_dataframe",
"pyspark.sql.tests.test_datasources",
"pyspark.sql.tests.test_functions",
"pyspark.sql.tests.test_group",
"pyspark.sql.tests.test_pandas_cogrouped_map",
"pyspark.sql.tests.test_pandas_grouped_map",
"pyspark.sql.tests.test_pandas_map",
"pyspark.sql.tests.test_pandas_udf",
"pyspark.sql.tests.test_pandas_udf_grouped_agg",
"pyspark.sql.tests.test_pandas_udf_scalar",
"pyspark.sql.tests.test_pandas_udf_typehints",
"pyspark.sql.tests.test_pandas_udf_window",
"pyspark.sql.tests.test_readwriter",
"pyspark.sql.tests.test_serde",
"pyspark.sql.tests.test_session",
"pyspark.sql.tests.test_streaming",
"pyspark.sql.tests.test_types",
"pyspark.sql.tests.test_udf",
"pyspark.sql.tests.test_utils",
]
] + _discover_python_unittests("pyspark/sql/tests"),
)


Expand All @@ -474,10 +490,7 @@ def __hash__(self):
source_file_regexes=[
"python/pyspark/resource"
],
python_test_goals=[
# unittests
"pyspark.resource.tests.test_resources",
]
python_test_goals=_discover_python_unittests("pyspark/resource/tests"),
)


Expand All @@ -494,12 +507,7 @@ def __hash__(self):
python_test_goals=[
# doctests
"pyspark.streaming.util",
Comment thread
HyukjinKwon marked this conversation as resolved.
# unittests
"pyspark.streaming.tests.test_context",
"pyspark.streaming.tests.test_dstream",
"pyspark.streaming.tests.test_kinesis",
"pyspark.streaming.tests.test_listener",
]
] + _discover_python_unittests("pyspark/streaming/tests"),
)


Expand All @@ -525,17 +533,10 @@ def __hash__(self):
"pyspark.mllib.stat.KernelDensity",
"pyspark.mllib.tree",
"pyspark.mllib.util",
# unittests
"pyspark.mllib.tests.test_algorithms",
"pyspark.mllib.tests.test_feature",
"pyspark.mllib.tests.test_linalg",
"pyspark.mllib.tests.test_stat",
"pyspark.mllib.tests.test_streaming_algorithms",
"pyspark.mllib.tests.test_util",
],
] + _discover_python_unittests("pyspark/mllib/tests"),
excluded_python_implementations=[
"PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there
]
],
)


Expand All @@ -559,27 +560,13 @@ def __hash__(self):
"pyspark.ml.regression",
"pyspark.ml.stat",
"pyspark.ml.tuning",
# unittests
"pyspark.ml.tests.test_algorithms",
"pyspark.ml.tests.test_base",
"pyspark.ml.tests.test_evaluation",
"pyspark.ml.tests.test_feature",
"pyspark.ml.tests.test_image",
"pyspark.ml.tests.test_linalg",
"pyspark.ml.tests.test_param",
"pyspark.ml.tests.test_persistence",
"pyspark.ml.tests.test_pipeline",
"pyspark.ml.tests.test_stat",
"pyspark.ml.tests.test_training_summary",
"pyspark.ml.tests.test_tuning",
"pyspark.ml.tests.test_util",
"pyspark.ml.tests.test_wrapper",
],
] + _discover_python_unittests("pyspark/ml/tests"),
excluded_python_implementations=[
"PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there
]
],
)


pyspark_pandas = Module(
name="pyspark-pandas",
dependencies=[pyspark_core, pyspark_sql],
Expand Down Expand Up @@ -614,59 +601,14 @@ def __hash__(self):
"pyspark.pandas.spark.accessors",
"pyspark.pandas.spark.utils",
"pyspark.pandas.typedef.typehints",
# unittests
"pyspark.pandas.tests.data_type_ops.test_base",
"pyspark.pandas.tests.data_type_ops.test_binary_ops",
"pyspark.pandas.tests.data_type_ops.test_boolean_ops",
"pyspark.pandas.tests.data_type_ops.test_categorical_ops",
"pyspark.pandas.tests.data_type_ops.test_complex_ops",
"pyspark.pandas.tests.data_type_ops.test_date_ops",
"pyspark.pandas.tests.data_type_ops.test_datetime_ops",
"pyspark.pandas.tests.data_type_ops.test_decimal_ops",
"pyspark.pandas.tests.data_type_ops.test_null_ops",
"pyspark.pandas.tests.data_type_ops.test_num_ops",
"pyspark.pandas.tests.data_type_ops.test_string_ops",
"pyspark.pandas.tests.data_type_ops.test_udt_ops",
"pyspark.pandas.tests.indexes.test_category",
"pyspark.pandas.tests.plot.test_frame_plot",
"pyspark.pandas.tests.plot.test_frame_plot_matplotlib",
"pyspark.pandas.tests.plot.test_frame_plot_plotly",
"pyspark.pandas.tests.plot.test_series_plot",
"pyspark.pandas.tests.plot.test_series_plot_matplotlib",
"pyspark.pandas.tests.plot.test_series_plot_plotly",
"pyspark.pandas.tests.test_categorical",
"pyspark.pandas.tests.test_config",
"pyspark.pandas.tests.test_csv",
"pyspark.pandas.tests.test_dataframe_conversion",
"pyspark.pandas.tests.test_dataframe_spark_io",
"pyspark.pandas.tests.test_default_index",
"pyspark.pandas.tests.test_expanding",
"pyspark.pandas.tests.test_extension",
"pyspark.pandas.tests.test_frame_spark",
"pyspark.pandas.tests.test_indexops_spark",
"pyspark.pandas.tests.test_internal",
"pyspark.pandas.tests.test_namespace",
"pyspark.pandas.tests.test_numpy_compat",
"pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding",
"pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
"pyspark.pandas.tests.test_repr",
"pyspark.pandas.tests.test_reshape",
"pyspark.pandas.tests.test_rolling",
"pyspark.pandas.tests.test_series_conversion",
"pyspark.pandas.tests.test_series_datetime",
"pyspark.pandas.tests.test_series_string",
"pyspark.pandas.tests.test_spark_functions",
"pyspark.pandas.tests.test_sql",
"pyspark.pandas.tests.test_typedef",
"pyspark.pandas.tests.test_utils",
"pyspark.pandas.tests.test_window",
],
] + _discover_python_unittests("pyspark/pandas/tests"),
excluded_python_implementations=[
"PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
# they aren't available there
]
# they aren't available there
],
)


pyspark_pandas_slow = Module(
name="pyspark-pandas-slow",
dependencies=[pyspark_core, pyspark_sql],
Expand All @@ -678,17 +620,7 @@ def __hash__(self):
"pyspark.pandas.frame",
"pyspark.pandas.generic",
"pyspark.pandas.series",
# unittests
"pyspark.pandas.tests.indexes.test_base",
"pyspark.pandas.tests.indexes.test_datetime",
"pyspark.pandas.tests.test_dataframe",
"pyspark.pandas.tests.test_groupby",
"pyspark.pandas.tests.test_indexing",
"pyspark.pandas.tests.test_ops_on_diff_frames",
"pyspark.pandas.tests.test_ops_on_diff_frames_groupby",
"pyspark.pandas.tests.test_series",
"pyspark.pandas.tests.test_stats",
],
] + _discover_python_unittests("pyspark/pandas/tests", discover_slow=True),
excluded_python_implementations=[
"PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
# they aren't available there
Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils, SPARK_CONF_ARROW_ENABLED


# This is used in run-tests.py to discover the slow test. See more in the doc of
# _discover_python_unittests of dev/sparktestsupport/modules.py
is_slow_test = True


class IndexesTest(PandasOnSparkTestCase, TestUtils):
@property
def pdf(self):
Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/pandas/tests/indexes/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils


# This is used in run-tests.py to discover the slow test. See more in the doc of
# _discover_python_unittests of dev/sparktestsupport/modules.py
is_slow_test = True


class DatetimeIndexTest(PandasOnSparkTestCase, TestUtils):
@property
def fixed_freqs(self):
Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/pandas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@
from pyspark.pandas.utils import name_like_string


# This is used in run-tests.py to discover the slow test. See more in the doc of
# _discover_python_unittests of dev/sparktestsupport/modules.py
is_slow_test = True


class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
@property
def pdf(self):
Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils


# This is used in run-tests.py to discover the slow test. See more in the doc of
# _discover_python_unittests of dev/sparktestsupport/modules.py
is_slow_test = True


class GroupByTest(PandasOnSparkTestCase, TestUtils):
def test_groupby_simple(self):
pdf = pd.DataFrame(
Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/pandas/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@
from pyspark.testing.pandasutils import ComparisonTestBase, PandasOnSparkTestCase, compare_both


# This is used in run-tests.py to discover the slow test. See more in the doc of
# _discover_python_unittests of dev/sparktestsupport/modules.py
is_slow_test = True


class BasicIndexingTest(ComparisonTestBase):
@property
def pdf(self):
Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/pandas/tests/test_ops_on_diff_frames.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@
)


# This is used in run-tests.py to discover the slow test. See more in the doc of
# _discover_python_unittests of dev/sparktestsupport/modules.py
is_slow_test = True


class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
@classmethod
def setUpClass(cls):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
from pyspark.testing.sqlutils import SQLTestUtils


# This is used in run-tests.py to discover the slow test. See more in the doc of
# _discover_python_unittests of dev/sparktestsupport/modules.py
is_slow_test = True


class OpsOnDiffFramesGroupByTest(PandasOnSparkTestCase, SQLTestUtils):
@classmethod
def setUpClass(cls):
Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@
)


# This is used in run-tests.py to discover the slow test. See more in the doc of
# _discover_python_unittests of dev/sparktestsupport/modules.py
is_slow_test = True


class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
@property
def pser(self):
Expand Down
Loading