Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ jobs:
export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
# Hive and SQL tests become flaky when running in parallel as it's too intensive.
if [[ "$MODULES_TO_TEST" == "hive" ]] || [[ "$MODULES_TO_TEST" == "sql" ]]; then export SERIAL_SBT_TESTS=1; fi
export PYTHONPATH="`pwd`/python/lib/py4j-0.10.9.2-src.zip:${PYTHONPATH}"
./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
- name: Upload test results to report
if: always()
Expand Down Expand Up @@ -231,6 +232,7 @@ jobs:
run: |
export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }}
export PATH=$PATH:$HOME/miniconda/bin
export PYTHONPATH="`pwd`/python/lib/py4j-0.10.9.2-src.zip:${PYTHONPATH}"
./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST"
- name: Upload test results to report
if: always()
Expand Down
4 changes: 3 additions & 1 deletion dev/run-tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -761,7 +761,9 @@ def main():
# run the test suites
run_scala_tests(build_tool, extra_profiles, test_modules, excluded_tags, included_tags)

modules_with_python_tests = [m for m in test_modules if m.python_test_goals]
modules_with_python_tests = [
m for m in test_modules if (m.python_test_goals or m.python_discover_paths)
]
if modules_with_python_tests:
# We only run PySpark tests with coverage report in one specific job with
# Spark master with SBT in Jenkins.
Expand Down
168 changes: 39 additions & 129 deletions dev/sparktestsupport/modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class Module(object):
def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(),
environ=None, sbt_test_goals=(), python_test_goals=(),
excluded_python_implementations=(), test_tags=(), should_run_r_tests=False,
should_run_build_tests=False):
should_run_build_tests=False, python_discover_paths=()):
"""
Define a new module.

Expand All @@ -50,14 +50,17 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=
:param environ: A dict of environment variables that should be set when files in this
module are changed.
:param sbt_test_goals: A set of SBT test goals for testing this module.
:param python_test_goals: A set of Python test goals for testing this module.
:param python_test_goals: A set of Python test goals for testing this module. Note that the
unittests found under python_discover_paths are discovered and appended to
python_test_goals.
:param excluded_python_implementations: A set of Python implementations that are not
supported by this module's Python components. The values in this set should match
strings returned by Python's `platform.python_implementation()`.
:param test_tags: A set of tags that will be excluded when running unit tests if the module
is not explicitly changed.
:param should_run_r_tests: If true, changes in this module will trigger all R tests.
:param should_run_build_tests: If true, changes in this module will trigger build tests.
:param python_discover_paths: A set of Python unittest paths to be discovered.
"""
self.name = name
self.dependencies = dependencies
Expand All @@ -70,6 +73,7 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=
self.test_tags = test_tags
self.should_run_r_tests = should_run_r_tests
self.should_run_build_tests = should_run_build_tests
self.python_discover_paths = python_discover_paths

self.dependent_modules = set()
for dep in dependencies:
Expand Down Expand Up @@ -388,24 +392,11 @@ def __hash__(self):
"pyspark.profiler",
"pyspark.shuffle",
"pyspark.util",
],
python_discover_paths=[
# unittests
"pyspark.tests.test_appsubmit",
"pyspark.tests.test_broadcast",
"pyspark.tests.test_conf",
"pyspark.tests.test_context",
"pyspark.tests.test_daemon",
"pyspark.tests.test_install_spark",
"pyspark.tests.test_join",
"pyspark.tests.test_profiler",
"pyspark.tests.test_rdd",
"pyspark.tests.test_rddbarrier",
"pyspark.tests.test_readwrite",
"pyspark.tests.test_serializers",
"pyspark.tests.test_shuffle",
"pyspark.tests.test_taskcontext",
"pyspark.tests.test_util",
"pyspark.tests.test_worker",
]
"pyspark/tests"
],
)

pyspark_sql = Module(
Expand Down Expand Up @@ -437,32 +428,11 @@ def __hash__(self):
"pyspark.sql.pandas.serializers",
"pyspark.sql.pandas.typehints",
"pyspark.sql.pandas.utils",
],
python_discover_paths=[
# unittests
"pyspark.sql.tests.test_arrow",
"pyspark.sql.tests.test_catalog",
"pyspark.sql.tests.test_column",
"pyspark.sql.tests.test_conf",
"pyspark.sql.tests.test_context",
"pyspark.sql.tests.test_dataframe",
"pyspark.sql.tests.test_datasources",
"pyspark.sql.tests.test_functions",
"pyspark.sql.tests.test_group",
"pyspark.sql.tests.test_pandas_cogrouped_map",
"pyspark.sql.tests.test_pandas_grouped_map",
"pyspark.sql.tests.test_pandas_map",
"pyspark.sql.tests.test_pandas_udf",
"pyspark.sql.tests.test_pandas_udf_grouped_agg",
"pyspark.sql.tests.test_pandas_udf_scalar",
"pyspark.sql.tests.test_pandas_udf_typehints",
"pyspark.sql.tests.test_pandas_udf_window",
"pyspark.sql.tests.test_readwriter",
"pyspark.sql.tests.test_serde",
"pyspark.sql.tests.test_session",
"pyspark.sql.tests.test_streaming",
"pyspark.sql.tests.test_types",
"pyspark.sql.tests.test_udf",
"pyspark.sql.tests.test_utils",
]
"pyspark/sql/tests"
],
)


Expand All @@ -474,10 +444,11 @@ def __hash__(self):
source_file_regexes=[
"python/pyspark/resource"
],
python_test_goals=[
python_test_goals=[],
python_discover_paths=[
# unittests
"pyspark.resource.tests.test_resources",
]
"pyspark/resource/tests"
],
)


Expand All @@ -494,12 +465,11 @@ def __hash__(self):
python_test_goals=[
# doctests
"pyspark.streaming.util",
],
python_discover_paths=[
# unittests
"pyspark.streaming.tests.test_context",
"pyspark.streaming.tests.test_dstream",
"pyspark.streaming.tests.test_kinesis",
"pyspark.streaming.tests.test_listener",
]
"pyspark/streaming/tests"
],
)


Expand All @@ -525,17 +495,14 @@ def __hash__(self):
"pyspark.mllib.stat.KernelDensity",
"pyspark.mllib.tree",
"pyspark.mllib.util",
],
python_discover_paths=[
# unittests
"pyspark.mllib.tests.test_algorithms",
"pyspark.mllib.tests.test_feature",
"pyspark.mllib.tests.test_linalg",
"pyspark.mllib.tests.test_stat",
"pyspark.mllib.tests.test_streaming_algorithms",
"pyspark.mllib.tests.test_util",
"pyspark/mllib/tests"
],
excluded_python_implementations=[
"PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there
]
],
)


Expand All @@ -559,25 +526,14 @@ def __hash__(self):
"pyspark.ml.regression",
"pyspark.ml.stat",
"pyspark.ml.tuning",
],
python_discover_paths=[
# unittests
"pyspark.ml.tests.test_algorithms",
"pyspark.ml.tests.test_base",
"pyspark.ml.tests.test_evaluation",
"pyspark.ml.tests.test_feature",
"pyspark.ml.tests.test_image",
"pyspark.ml.tests.test_linalg",
"pyspark.ml.tests.test_param",
"pyspark.ml.tests.test_persistence",
"pyspark.ml.tests.test_pipeline",
"pyspark.ml.tests.test_stat",
"pyspark.ml.tests.test_training_summary",
"pyspark.ml.tests.test_tuning",
"pyspark.ml.tests.test_util",
"pyspark.ml.tests.test_wrapper",
"pyspark/ml/tests"
],
excluded_python_implementations=[
"PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there
]
],
)

pyspark_pandas = Module(
Expand Down Expand Up @@ -614,58 +570,18 @@ def __hash__(self):
"pyspark.pandas.spark.accessors",
"pyspark.pandas.spark.utils",
"pyspark.pandas.typedef.typehints",
],
python_discover_paths=[
# unittests
"pyspark.pandas.tests.data_type_ops.test_base",
"pyspark.pandas.tests.data_type_ops.test_binary_ops",
"pyspark.pandas.tests.data_type_ops.test_boolean_ops",
"pyspark.pandas.tests.data_type_ops.test_categorical_ops",
"pyspark.pandas.tests.data_type_ops.test_complex_ops",
"pyspark.pandas.tests.data_type_ops.test_date_ops",
"pyspark.pandas.tests.data_type_ops.test_datetime_ops",
"pyspark.pandas.tests.data_type_ops.test_null_ops",
"pyspark.pandas.tests.data_type_ops.test_num_ops",
"pyspark.pandas.tests.data_type_ops.test_string_ops",
"pyspark.pandas.tests.data_type_ops.test_udt_ops",
"pyspark.pandas.tests.indexes.test_category",
"pyspark.pandas.tests.plot.test_frame_plot",
"pyspark.pandas.tests.plot.test_frame_plot_matplotlib",
"pyspark.pandas.tests.plot.test_frame_plot_plotly",
"pyspark.pandas.tests.plot.test_series_plot",
"pyspark.pandas.tests.plot.test_series_plot_matplotlib",
"pyspark.pandas.tests.plot.test_series_plot_plotly",
"pyspark.pandas.tests.test_categorical",
"pyspark.pandas.tests.test_config",
"pyspark.pandas.tests.test_csv",
"pyspark.pandas.tests.test_dataframe_conversion",
"pyspark.pandas.tests.test_dataframe_spark_io",
"pyspark.pandas.tests.test_default_index",
"pyspark.pandas.tests.test_expanding",
"pyspark.pandas.tests.test_extension",
"pyspark.pandas.tests.test_frame_spark",
"pyspark.pandas.tests.test_indexops_spark",
"pyspark.pandas.tests.test_internal",
"pyspark.pandas.tests.test_namespace",
"pyspark.pandas.tests.test_numpy_compat",
"pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding",
"pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
"pyspark.pandas.tests.test_repr",
"pyspark.pandas.tests.test_reshape",
"pyspark.pandas.tests.test_rolling",
"pyspark.pandas.tests.test_series_conversion",
"pyspark.pandas.tests.test_series_datetime",
"pyspark.pandas.tests.test_series_string",
"pyspark.pandas.tests.test_spark_functions",
"pyspark.pandas.tests.test_sql",
"pyspark.pandas.tests.test_typedef",
"pyspark.pandas.tests.test_utils",
"pyspark.pandas.tests.test_window",
"pyspark/pandas/tests"
],
excluded_python_implementations=[
"PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
# they aren't available there
]
# they aren't available there
],
)


pyspark_pandas_slow = Module(
name="pyspark-pandas-slow",
dependencies=[pyspark_core, pyspark_sql],
Expand All @@ -677,16 +593,10 @@ def __hash__(self):
"pyspark.pandas.frame",
"pyspark.pandas.generic",
"pyspark.pandas.series",
],
python_discover_paths=[
# unittests
"pyspark.pandas.tests.indexes.test_base",
"pyspark.pandas.tests.indexes.test_datetime",
"pyspark.pandas.tests.test_dataframe",
"pyspark.pandas.tests.test_groupby",
"pyspark.pandas.tests.test_indexing",
"pyspark.pandas.tests.test_ops_on_diff_frames",
"pyspark.pandas.tests.test_ops_on_diff_frames_groupby",
"pyspark.pandas.tests.test_series",
"pyspark.pandas.tests.test_stats",
("pyspark/pandas/tests", "slow")
],
excluded_python_implementations=[
"PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils, SPARK_CONF_ARROW_ENABLED


# This is used in run-tests.py to discover the slow test. See more in the doc of
# _discover_python_unittests of python/run-tests.py
is_slow_test = True


class IndexesTest(PandasOnSparkTestCase, TestUtils):
@property
def pdf(self):
Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/pandas/tests/indexes/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils


# This is used in run-tests.py to discover the slow test. See more in the doc of
# _discover_python_unittests of python/run-tests.py
is_slow_test = True


class DatetimeIndexTest(PandasOnSparkTestCase, TestUtils):
@property
def fixed_freqs(self):
Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/pandas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@
from pyspark.pandas.utils import name_like_string


# This is used in run-tests.py to discover the slow test. See more in the doc of
# _discover_python_unittests of python/run-tests.py
is_slow_test = True


class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
@property
def pdf(self):
Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@
from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils


# This is used in run-tests.py to discover the slow test. See more in the doc of
# _discover_python_unittests of python/run-tests.py
is_slow_test = True


class GroupByTest(PandasOnSparkTestCase, TestUtils):
def test_groupby_simple(self):
pdf = pd.DataFrame(
Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/pandas/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@
from pyspark.testing.pandasutils import ComparisonTestBase, PandasOnSparkTestCase, compare_both


# This is used in run-tests.py to discover the slow test. See more in the doc of
# _discover_python_unittests of python/run-tests.py
is_slow_test = True


class BasicIndexingTest(ComparisonTestBase):
@property
def pdf(self):
Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/pandas/tests/test_ops_on_diff_frames.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@
)


# This is used in run-tests.py to discover the slow test. See more in the doc of
# _discover_python_unittests of python/run-tests.py
is_slow_test = True


class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
@classmethod
def setUpClass(cls):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
from pyspark.testing.sqlutils import SQLTestUtils


# This is used in run-tests.py to discover the slow test. See more in the doc of
# _discover_python_unittests of python/run-tests.py
is_slow_test = True


class OpsOnDiffFramesGroupByTest(PandasOnSparkTestCase, SQLTestUtils):
@classmethod
def setUpClass(cls):
Expand Down
5 changes: 5 additions & 0 deletions python/pyspark/pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@
)


# This is used in run-tests.py to discover the slow test. See more in the doc of
# _discover_python_unittests of python/run-tests.py
is_slow_test = True


class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
@property
def pser(self):
Expand Down
Loading