From 5a7e6e23ceda0d093f552117e543025b54078154 Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Thu, 10 Jun 2021 21:20:14 +0800 Subject: [PATCH 1/8] auto test --- dev/sparktestsupport/modules.py | 205 +++++++++++--------------------- 1 file changed, 72 insertions(+), 133 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 4c4a8f866b7c1..b2f5c346be8ab 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -19,10 +19,28 @@ import itertools import os import re +import glob + +from sparktestsupport import SPARK_HOME all_modules = [] +def _discover_python_unittests(paths): + if not paths: + return set([]) + tests = set([]) + pyspark_path = os.path.join(SPARK_HOME, "python") + for path in paths: + # Discover the test*.py in every path + files = glob.glob(os.path.join(pyspark_path, path, "test*.py")) + for f in files: + # Convert 'pyspark_path/pyspark/tests/test_abc.py' to 'pyspark.tests.test_abc' + file2module = os.path.relpath(f, pyspark_path)[:-3].replace("/", ".") + tests.add(file2module) + return tests + + @total_ordering class Module(object): """ @@ -35,7 +53,7 @@ class Module(object): def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), environ=None, sbt_test_goals=(), python_test_goals=(), excluded_python_implementations=(), test_tags=(), should_run_r_tests=False, - should_run_build_tests=False): + should_run_build_tests=False, python_test_paths=(), python_excluded_tests=()): """ Define a new module. @@ -58,6 +76,8 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags= is not explicitly changed. :param should_run_r_tests: If true, changes in this module will trigger all R tests. :param should_run_build_tests: If true, changes in this module will trigger build tests. + :param python_test_paths: A set of python test paths to be discovered + :param python_excluded_tests: A set of excluded Python tests """ self.name = name self.dependencies = dependencies @@ -65,7 +85,10 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags= self.sbt_test_goals = sbt_test_goals self.build_profile_flags = build_profile_flags self.environ = environ or {} - self.python_test_goals = python_test_goals + discovered_goals = _discover_python_unittests(python_test_paths) + # Final goals = Manual specified goals + Discoverd goals - Excluded goals + all_goals = set(python_test_goals) | set(discovered_goals) + self.python_test_goals = sorted(list(set(all_goals) - set(python_excluded_tests))) self.excluded_python_implementations = excluded_python_implementations self.test_tags = test_tags self.should_run_r_tests = should_run_r_tests @@ -382,24 +405,10 @@ def __hash__(self): "pyspark.profiler", "pyspark.shuffle", "pyspark.util", - # unittests - "pyspark.tests.test_appsubmit", - "pyspark.tests.test_broadcast", - "pyspark.tests.test_conf", - "pyspark.tests.test_context", - "pyspark.tests.test_daemon", - "pyspark.tests.test_install_spark", - "pyspark.tests.test_join", - "pyspark.tests.test_profiler", - "pyspark.tests.test_rdd", - "pyspark.tests.test_rddbarrier", - "pyspark.tests.test_readwrite", - "pyspark.tests.test_serializers", - "pyspark.tests.test_shuffle", - "pyspark.tests.test_taskcontext", - "pyspark.tests.test_util", - "pyspark.tests.test_worker", - ] + ], + python_test_paths=[ + "pyspark/tests" + ], ) pyspark_sql = Module( @@ -431,32 +440,10 @@ def __hash__(self): "pyspark.sql.pandas.serializers", "pyspark.sql.pandas.typehints", "pyspark.sql.pandas.utils", - # unittests - "pyspark.sql.tests.test_arrow", - "pyspark.sql.tests.test_catalog", - "pyspark.sql.tests.test_column", - "pyspark.sql.tests.test_conf", - "pyspark.sql.tests.test_context", - "pyspark.sql.tests.test_dataframe", - "pyspark.sql.tests.test_datasources", - "pyspark.sql.tests.test_functions", - "pyspark.sql.tests.test_group", - "pyspark.sql.tests.test_pandas_cogrouped_map", - "pyspark.sql.tests.test_pandas_grouped_map", - "pyspark.sql.tests.test_pandas_map", - "pyspark.sql.tests.test_pandas_udf", - "pyspark.sql.tests.test_pandas_udf_grouped_agg", - "pyspark.sql.tests.test_pandas_udf_scalar", - "pyspark.sql.tests.test_pandas_udf_typehints", - "pyspark.sql.tests.test_pandas_udf_window", - "pyspark.sql.tests.test_readwriter", - "pyspark.sql.tests.test_serde", - "pyspark.sql.tests.test_session", - "pyspark.sql.tests.test_streaming", - "pyspark.sql.tests.test_types", - "pyspark.sql.tests.test_udf", - "pyspark.sql.tests.test_utils", - ] + ], + python_test_paths=[ + "pyspark/sql/tests" + ], ) @@ -471,7 +458,10 @@ def __hash__(self): python_test_goals=[ # unittests "pyspark.resource.tests.test_resources", - ] + ], + python_test_paths=[ + "pyspark/resource/tests" + ], ) @@ -488,12 +478,10 @@ def __hash__(self): python_test_goals=[ # doctests "pyspark.streaming.util", - # unittests - "pyspark.streaming.tests.test_context", - "pyspark.streaming.tests.test_dstream", - "pyspark.streaming.tests.test_kinesis", - "pyspark.streaming.tests.test_listener", - ] + ], + python_test_paths=[ + "pyspark/streaming/tests" + ], ) @@ -519,17 +507,13 @@ def __hash__(self): "pyspark.mllib.stat.KernelDensity", "pyspark.mllib.tree", "pyspark.mllib.util", - # unittests - "pyspark.mllib.tests.test_algorithms", - "pyspark.mllib.tests.test_feature", - "pyspark.mllib.tests.test_linalg", - "pyspark.mllib.tests.test_stat", - "pyspark.mllib.tests.test_streaming_algorithms", - "pyspark.mllib.tests.test_util", ], excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there - ] + ], + python_test_paths=[ + "pyspark/mllib/tests" + ], ) @@ -553,27 +537,28 @@ def __hash__(self): "pyspark.ml.regression", "pyspark.ml.stat", "pyspark.ml.tuning", - # unittests - "pyspark.ml.tests.test_algorithms", - "pyspark.ml.tests.test_base", - "pyspark.ml.tests.test_evaluation", - "pyspark.ml.tests.test_feature", - "pyspark.ml.tests.test_image", - "pyspark.ml.tests.test_linalg", - "pyspark.ml.tests.test_param", - "pyspark.ml.tests.test_persistence", - "pyspark.ml.tests.test_pipeline", - "pyspark.ml.tests.test_stat", - "pyspark.ml.tests.test_training_summary", - "pyspark.ml.tests.test_tuning", - "pyspark.ml.tests.test_util", - "pyspark.ml.tests.test_wrapper", ], excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there - ] + ], + python_test_paths=[ + "pyspark/ml/tests" + ], ) +pyspark_pandas_slow_unittests = [ + # unittests + "pyspark.pandas.tests.indexes.test_base", + "pyspark.pandas.tests.indexes.test_datetime", + "pyspark.pandas.tests.test_dataframe", + "pyspark.pandas.tests.test_groupby", + "pyspark.pandas.tests.test_indexing", + "pyspark.pandas.tests.test_ops_on_diff_frames", + "pyspark.pandas.tests.test_ops_on_diff_frames_groupby", + "pyspark.pandas.tests.test_series", + "pyspark.pandas.tests.test_stats", +] + pyspark_pandas = Module( name="pyspark-pandas", dependencies=[pyspark_core, pyspark_sql], @@ -608,54 +593,18 @@ def __hash__(self): "pyspark.pandas.spark.accessors", "pyspark.pandas.spark.utils", "pyspark.pandas.typedef.typehints", - # unittests - "pyspark.pandas.tests.data_type_ops.test_binary_ops", - "pyspark.pandas.tests.data_type_ops.test_boolean_ops", - "pyspark.pandas.tests.data_type_ops.test_categorical_ops", - "pyspark.pandas.tests.data_type_ops.test_complex_ops", - "pyspark.pandas.tests.data_type_ops.test_date_ops", - "pyspark.pandas.tests.data_type_ops.test_datetime_ops", - "pyspark.pandas.tests.data_type_ops.test_null_ops", - "pyspark.pandas.tests.data_type_ops.test_num_ops", - "pyspark.pandas.tests.data_type_ops.test_string_ops", - "pyspark.pandas.tests.data_type_ops.test_udt_ops", - "pyspark.pandas.tests.indexes.test_category", - "pyspark.pandas.tests.plot.test_frame_plot", - "pyspark.pandas.tests.plot.test_frame_plot_matplotlib", - "pyspark.pandas.tests.plot.test_frame_plot_plotly", - "pyspark.pandas.tests.plot.test_series_plot", - "pyspark.pandas.tests.plot.test_series_plot_matplotlib", - "pyspark.pandas.tests.plot.test_series_plot_plotly", - "pyspark.pandas.tests.test_categorical", - "pyspark.pandas.tests.test_config", - "pyspark.pandas.tests.test_csv", - "pyspark.pandas.tests.test_dataframe_conversion", - "pyspark.pandas.tests.test_dataframe_spark_io", - "pyspark.pandas.tests.test_default_index", - "pyspark.pandas.tests.test_expanding", - "pyspark.pandas.tests.test_extension", - "pyspark.pandas.tests.test_frame_spark", - "pyspark.pandas.tests.test_indexops_spark", - "pyspark.pandas.tests.test_internal", - "pyspark.pandas.tests.test_namespace", - "pyspark.pandas.tests.test_numpy_compat", - "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding", - "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling", - "pyspark.pandas.tests.test_repr", - "pyspark.pandas.tests.test_reshape", - "pyspark.pandas.tests.test_rolling", - "pyspark.pandas.tests.test_series_conversion", - "pyspark.pandas.tests.test_series_datetime", - "pyspark.pandas.tests.test_series_string", - "pyspark.pandas.tests.test_sql", - "pyspark.pandas.tests.test_typedef", - "pyspark.pandas.tests.test_utils", - "pyspark.pandas.tests.test_window", ], excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and - # they aren't available there - ] + # they aren't available there + ], + python_test_paths=[ + "pyspark/pandas/tests", + "pyspark/pandas/tests/data_type_ops", + "pyspark/pandas/tests/indexes", + "pyspark/pandas/tests/plot", + ], + python_excluded_tests=pyspark_pandas_slow_unittests, ) pyspark_pandas_slow = Module( @@ -669,17 +618,7 @@ def __hash__(self): "pyspark.pandas.frame", "pyspark.pandas.generic", "pyspark.pandas.series", - # unittests - "pyspark.pandas.tests.indexes.test_base", - "pyspark.pandas.tests.indexes.test_datetime", - "pyspark.pandas.tests.test_dataframe", - "pyspark.pandas.tests.test_groupby", - "pyspark.pandas.tests.test_indexing", - "pyspark.pandas.tests.test_ops_on_diff_frames", - "pyspark.pandas.tests.test_ops_on_diff_frames_groupby", - "pyspark.pandas.tests.test_series", - "pyspark.pandas.tests.test_stats", - ], + ] + pyspark_pandas_slow_unittests, excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and # they aren't available there From 23f2c87c7ee16d433b3d59dbb71dae00203fea97 Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Tue, 15 Jun 2021 10:28:41 +0800 Subject: [PATCH 2/8] Change glob rule from test* to test_* --- dev/sparktestsupport/modules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index b2f5c346be8ab..2b4d72f49b130 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -33,7 +33,7 @@ def _discover_python_unittests(paths): pyspark_path = os.path.join(SPARK_HOME, "python") for path in paths: # Discover the test*.py in every path - files = glob.glob(os.path.join(pyspark_path, path, "test*.py")) + files = glob.glob(os.path.join(pyspark_path, path, "test_*.py")) for f in files: # Convert 'pyspark_path/pyspark/tests/test_abc.py' to 'pyspark.tests.test_abc' file2module = os.path.relpath(f, pyspark_path)[:-3].replace("/", ".") From 22015a526ae9fe66482d477fbc858ec6d73d18b4 Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Wed, 16 Jun 2021 14:45:55 +0800 Subject: [PATCH 3/8] Change the 'test_* name discover' to 'unittest module discover' --- dev/sparktestsupport/modules.py | 54 ++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 2b4d72f49b130..170c0e20389c6 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -16,28 +16,61 @@ # from functools import total_ordering +from importlib import import_module +import inspect import itertools import os +from pkgutil import iter_modules import re -import glob +import sys +import unittest from sparktestsupport import SPARK_HOME + all_modules = [] +pyspark_path = os.path.join(SPARK_HOME, "python") +sys.path.append(pyspark_path) + + +def _contain_unittests_class(module_name): + """ + Check if the module with specific module_name has classes are derived from unittest.TestCase. + Such as: + pyspark.tests.test_appsubmit, it will return True, because there is SparkSubmitTests which is + included under the module of pyspark.tests.test_appsubmit, inherits from unittest.TestCase. + `` + :param module_name: the complete name of module to be checked. + :return: True if contains unittest classes otherwise False. + An ``ModuleNotFoundError`` will raise if the module is not found + """ + _module = import_module(module_name) + for _, _class in inspect.getmembers(_module, inspect.isclass): + if issubclass(_class, unittest.TestCase): + return True + return False def _discover_python_unittests(paths): + """ + Discover the python module which contains unittests under paths. + Such as: + ['pyspark/tests'], it will return the set of module name under the path of pyspark/tests, like + {'pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', ...} + :param paths: paths of module to be discovered. + :return: A set of complete test module name discovered udner the paths + """ if not paths: - return set([]) - tests = set([]) - pyspark_path = os.path.join(SPARK_HOME, "python") + return set() + tests = set() + for path in paths: - # Discover the test*.py in every path - files = glob.glob(os.path.join(pyspark_path, path, "test_*.py")) - for f in files: - # Convert 'pyspark_path/pyspark/tests/test_abc.py' to 'pyspark.tests.test_abc' - file2module = os.path.relpath(f, pyspark_path)[:-3].replace("/", ".") - tests.add(file2module) + real_path = os.path.join(pyspark_path, path) + _prefix = path.replace('/', '.') + # iter modules under the specific tests path + for module in iter_modules([real_path], prefix=_prefix+'.'): + if _contain_unittests_class(module.name): + tests.add(module.name) return tests @@ -56,7 +89,6 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags= should_run_build_tests=False, python_test_paths=(), python_excluded_tests=()): """ Define a new module. - :param name: A short module name, for display in logging and error messages. :param dependencies: A set of dependencies for this module. This should only include direct dependencies; transitive dependencies are resolved automatically. From 7cda24e3460cd9270fcdf3f866121ff9a69506dc Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Wed, 16 Jun 2021 18:27:48 +0800 Subject: [PATCH 4/8] Revert to test_* discover --- dev/sparktestsupport/modules.py | 53 +++++++-------------------------- 1 file changed, 10 insertions(+), 43 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 170c0e20389c6..5b6bbf864dfa5 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -16,61 +16,28 @@ # from functools import total_ordering -from importlib import import_module -import inspect import itertools import os -from pkgutil import iter_modules import re -import sys -import unittest +import glob from sparktestsupport import SPARK_HOME - all_modules = [] -pyspark_path = os.path.join(SPARK_HOME, "python") -sys.path.append(pyspark_path) - - -def _contain_unittests_class(module_name): - """ - Check if the module with specific module_name has classes are derived from unittest.TestCase. - Such as: - pyspark.tests.test_appsubmit, it will return True, because there is SparkSubmitTests which is - included under the module of pyspark.tests.test_appsubmit, inherits from unittest.TestCase. - `` - :param module_name: the complete name of module to be checked. - :return: True if contains unittest classes otherwise False. - An ``ModuleNotFoundError`` will raise if the module is not found - """ - _module = import_module(module_name) - for _, _class in inspect.getmembers(_module, inspect.isclass): - if issubclass(_class, unittest.TestCase): - return True - return False def _discover_python_unittests(paths): - """ - Discover the python module which contains unittests under paths. - Such as: - ['pyspark/tests'], it will return the set of module name under the path of pyspark/tests, like - {'pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', ...} - :param paths: paths of module to be discovered. - :return: A set of complete test module name discovered udner the paths - """ if not paths: - return set() - tests = set() - + return set([]) + tests = set([]) + pyspark_path = os.path.join(SPARK_HOME, "python") for path in paths: - real_path = os.path.join(pyspark_path, path) - _prefix = path.replace('/', '.') - # iter modules under the specific tests path - for module in iter_modules([real_path], prefix=_prefix+'.'): - if _contain_unittests_class(module.name): - tests.add(module.name) + # Discover the test*.py in every path + files = glob.glob(os.path.join(pyspark_path, path, "test_*.py")) + for f in files: + # Convert 'pyspark_path/pyspark/tests/test_abc.py' to 'pyspark.tests.test_abc' + file2module = os.path.relpath(f, pyspark_path)[:-3].replace("/", ".") + tests.add(file2module) return tests From 14ee8761ed79dc8fca8d02f594c9f7167b9ed523 Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Thu, 17 Jun 2021 11:20:47 +0800 Subject: [PATCH 5/8] Change 'test_* discover' to 'unittest test loader discover' --- dev/sparktestsupport/modules.py | 40 ++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 5b6bbf864dfa5..a5bc5d0c851ba 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -19,7 +19,7 @@ import itertools import os import re -import glob +import unittest from sparktestsupport import SPARK_HOME @@ -27,18 +27,38 @@ def _discover_python_unittests(paths): + """ + Discover the python module which contains unittests under paths. + + Such as: + ['pyspark/tests'], it will return the set of module name under the path of pyspark/tests, like + {'pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', ...} + + :param paths: paths of module to be discovered. + :return: A set of complete test module name discovered udner the paths + """ + + def add_suite(suite, modules): + """Gather the suite module names""" + if hasattr(suite, '__iter__'): + for test_case in suite: + add_suite(test_case, modules) + else: + modules.add(suite.__module__) + if not paths: return set([]) - tests = set([]) + modules = set([]) pyspark_path = os.path.join(SPARK_HOME, "python") for path in paths: - # Discover the test*.py in every path - files = glob.glob(os.path.join(pyspark_path, path, "test_*.py")) - for f in files: - # Convert 'pyspark_path/pyspark/tests/test_abc.py' to 'pyspark.tests.test_abc' - file2module = os.path.relpath(f, pyspark_path)[:-3].replace("/", ".") - tests.add(file2module) - return tests + # Discover the unittest in every path + suite = unittest.defaultTestLoader.discover( + os.path.join(pyspark_path, path), + top_level_dir=pyspark_path + ) + add_suite(suite, modules) + + return modules @total_ordering @@ -455,8 +475,6 @@ def __hash__(self): "python/pyspark/resource" ], python_test_goals=[ - # unittests - "pyspark.resource.tests.test_resources", ], python_test_paths=[ "pyspark/resource/tests" From b5871b06f23bc9abb4f83163bafc2f8e88101def Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Fri, 18 Jun 2021 23:19:22 +0800 Subject: [PATCH 6/8] Add _slow_test flag and address nits --- dev/sparktestsupport/modules.py | 123 ++++++++---------- .../pyspark/pandas/tests/indexes/test_base.py | 5 + .../pandas/tests/indexes/test_datetime.py | 5 + python/pyspark/pandas/tests/test_dataframe.py | 5 + python/pyspark/pandas/tests/test_groupby.py | 5 + python/pyspark/pandas/tests/test_indexing.py | 5 + .../pandas/tests/test_ops_on_diff_frames.py | 5 + .../tests/test_ops_on_diff_frames_groupby.py | 5 + python/pyspark/pandas/tests/test_series.py | 5 + python/pyspark/pandas/tests/test_stats.py | 5 + 10 files changed, 101 insertions(+), 67 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index a5bc5d0c851ba..49b84716d07e2 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -20,35 +20,54 @@ import os import re import unittest +import sys from sparktestsupport import SPARK_HOME all_modules = [] -def _discover_python_unittests(paths): - """ - Discover the python module which contains unittests under paths. +def _get_module_from_name(name): + __import__(name) + return sys.modules[name] + + +def _discover_python_unittests(*paths, discover_slow=False): + """Discover the python module which contains unittests under paths. Such as: ['pyspark/tests'], it will return the set of module name under the path of pyspark/tests, like {'pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', ...} - :param paths: paths of module to be discovered. - :return: A set of complete test module name discovered udner the paths + Parameters + ---------- + paths : str + Paths of modules to be discovered. + discover_slow : bool + If True, will only discover slow tests + If False, will discover all tests except slow tests + + Returns + ------- + A set of complete test module name discovered under specified paths """ - def add_suite(suite, modules): + def add_suite(testcases, modules, slow): """Gather the suite module names""" - if hasattr(suite, '__iter__'): - for test_case in suite: - add_suite(test_case, modules) + if hasattr(testcases, '__iter__'): + for test_case in testcases: + add_suite(test_case, modules, slow) else: - modules.add(suite.__module__) + name = testcases.__module__ + module = _get_module_from_name(name) + if slow and hasattr(module, '_slow_test'): + modules.add(name) + if not slow and not hasattr(module, '_slow_test'): + modules.add(name) if not paths: - return set([]) - modules = set([]) + return [] + _modules = set() pyspark_path = os.path.join(SPARK_HOME, "python") for path in paths: # Discover the unittest in every path @@ -56,9 +75,9 @@ def add_suite(suite, modules): os.path.join(pyspark_path, path), top_level_dir=pyspark_path ) - add_suite(suite, modules) + add_suite(suite, _modules, discover_slow) - return modules + return sorted(list(_modules)) @total_ordering @@ -73,9 +92,10 @@ class Module(object): def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(), environ=None, sbt_test_goals=(), python_test_goals=(), excluded_python_implementations=(), test_tags=(), should_run_r_tests=False, - should_run_build_tests=False, python_test_paths=(), python_excluded_tests=()): + should_run_build_tests=False): """ Define a new module. + :param name: A short module name, for display in logging and error messages. :param dependencies: A set of dependencies for this module. This should only include direct dependencies; transitive dependencies are resolved automatically. @@ -95,8 +115,6 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags= is not explicitly changed. :param should_run_r_tests: If true, changes in this module will trigger all R tests. :param should_run_build_tests: If true, changes in this module will trigger build tests. - :param python_test_paths: A set of python test paths to be discovered - :param python_excluded_tests: A set of excluded Python tests """ self.name = name self.dependencies = dependencies @@ -104,10 +122,7 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags= self.sbt_test_goals = sbt_test_goals self.build_profile_flags = build_profile_flags self.environ = environ or {} - discovered_goals = _discover_python_unittests(python_test_paths) - # Final goals = Manual specified goals + Discoverd goals - Excluded goals - all_goals = set(python_test_goals) | set(discovered_goals) - self.python_test_goals = sorted(list(set(all_goals) - set(python_excluded_tests))) + self.python_test_goals = python_test_goals self.excluded_python_implementations = excluded_python_implementations self.test_tags = test_tags self.should_run_r_tests = should_run_r_tests @@ -424,10 +439,7 @@ def __hash__(self): "pyspark.profiler", "pyspark.shuffle", "pyspark.util", - ], - python_test_paths=[ - "pyspark/tests" - ], + ] + _discover_python_unittests("pyspark/tests"), ) pyspark_sql = Module( @@ -459,10 +471,7 @@ def __hash__(self): "pyspark.sql.pandas.serializers", "pyspark.sql.pandas.typehints", "pyspark.sql.pandas.utils", - ], - python_test_paths=[ - "pyspark/sql/tests" - ], + ] + _discover_python_unittests("pyspark/sql/tests"), ) @@ -474,11 +483,7 @@ def __hash__(self): source_file_regexes=[ "python/pyspark/resource" ], - python_test_goals=[ - ], - python_test_paths=[ - "pyspark/resource/tests" - ], + python_test_goals=_discover_python_unittests("pyspark/resource/tests"), ) @@ -495,10 +500,7 @@ def __hash__(self): python_test_goals=[ # doctests "pyspark.streaming.util", - ], - python_test_paths=[ - "pyspark/streaming/tests" - ], + ] + _discover_python_unittests("pyspark/streaming/tests"), ) @@ -524,13 +526,10 @@ def __hash__(self): "pyspark.mllib.stat.KernelDensity", "pyspark.mllib.tree", "pyspark.mllib.util", - ], + ] + _discover_python_unittests("pyspark/mllib/tests"), excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there ], - python_test_paths=[ - "pyspark/mllib/tests" - ], ) @@ -554,27 +553,12 @@ def __hash__(self): "pyspark.ml.regression", "pyspark.ml.stat", "pyspark.ml.tuning", - ], + ] + _discover_python_unittests("pyspark/ml/tests"), excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy and it isn't available there ], - python_test_paths=[ - "pyspark/ml/tests" - ], ) -pyspark_pandas_slow_unittests = [ - # unittests - "pyspark.pandas.tests.indexes.test_base", - "pyspark.pandas.tests.indexes.test_datetime", - "pyspark.pandas.tests.test_dataframe", - "pyspark.pandas.tests.test_groupby", - "pyspark.pandas.tests.test_indexing", - "pyspark.pandas.tests.test_ops_on_diff_frames", - "pyspark.pandas.tests.test_ops_on_diff_frames_groupby", - "pyspark.pandas.tests.test_series", - "pyspark.pandas.tests.test_stats", -] pyspark_pandas = Module( name="pyspark-pandas", @@ -610,20 +594,19 @@ def __hash__(self): "pyspark.pandas.spark.accessors", "pyspark.pandas.spark.utils", "pyspark.pandas.typedef.typehints", - ], - excluded_python_implementations=[ - "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and - # they aren't available there - ], - python_test_paths=[ + ] + _discover_python_unittests( "pyspark/pandas/tests", "pyspark/pandas/tests/data_type_ops", "pyspark/pandas/tests/indexes", - "pyspark/pandas/tests/plot", + "pyspark/pandas/tests/plot" + ), + excluded_python_implementations=[ + "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and + # they aren't available there ], - python_excluded_tests=pyspark_pandas_slow_unittests, ) + pyspark_pandas_slow = Module( name="pyspark-pandas-slow", dependencies=[pyspark_core, pyspark_sql], @@ -635,7 +618,13 @@ def __hash__(self): "pyspark.pandas.frame", "pyspark.pandas.generic", "pyspark.pandas.series", - ] + pyspark_pandas_slow_unittests, + ] + _discover_python_unittests( + "pyspark/pandas/tests", + "pyspark/pandas/tests/data_type_ops", + "pyspark/pandas/tests/indexes", + "pyspark/pandas/tests/plot", + discover_slow=True + ), excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and # they aren't available there diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index d7c0ace911b35..9dc10ecb8b64f 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -34,6 +34,11 @@ from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils, SPARK_CONF_ARROW_ENABLED +# This is used in run-tests.py to discover the slow test. See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +_slow_test = True + + class IndexesTest(PandasOnSparkTestCase, TestUtils): @property def pdf(self): diff --git a/python/pyspark/pandas/tests/indexes/test_datetime.py b/python/pyspark/pandas/tests/indexes/test_datetime.py index 8a55e2efefdb1..e3e9b8f4cb885 100644 --- a/python/pyspark/pandas/tests/indexes/test_datetime.py +++ b/python/pyspark/pandas/tests/indexes/test_datetime.py @@ -25,6 +25,11 @@ from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils +# This is used in run-tests.py to discover the slow test. See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +_slow_test = True + + class DatetimeIndexTest(PandasOnSparkTestCase, TestUtils): @property def fixed_freqs(self): diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py index e54b7835c22ca..b42462262a982 100644 --- a/python/pyspark/pandas/tests/test_dataframe.py +++ b/python/pyspark/pandas/tests/test_dataframe.py @@ -50,6 +50,11 @@ from pyspark.pandas.utils import name_like_string +# This is used in run-tests.py to discover the slow test. See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +_slow_test = True + + class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): @property def pdf(self): diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py index 1bc182d8462f0..a88f2195943da 100644 --- a/python/pyspark/pandas/tests/test_groupby.py +++ b/python/pyspark/pandas/tests/test_groupby.py @@ -34,6 +34,11 @@ from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils +# This is used in run-tests.py to discover the slow test. See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +_slow_test = True + + class GroupByTest(PandasOnSparkTestCase, TestUtils): def test_groupby_simple(self): pdf = pd.DataFrame( diff --git a/python/pyspark/pandas/tests/test_indexing.py b/python/pyspark/pandas/tests/test_indexing.py index b74cf90d079f9..dc0e8abe00307 100644 --- a/python/pyspark/pandas/tests/test_indexing.py +++ b/python/pyspark/pandas/tests/test_indexing.py @@ -27,6 +27,11 @@ from pyspark.testing.pandasutils import ComparisonTestBase, PandasOnSparkTestCase, compare_both +# This is used in run-tests.py to discover the slow test. See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +_slow_test = True + + class BasicIndexingTest(ComparisonTestBase): @property def pdf(self): diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py index 12e87b202e699..5200a00d86240 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py @@ -35,6 +35,11 @@ ) +# This is used in run-tests.py to discover the slow test. See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +_slow_test = True + + class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils): @classmethod def setUpClass(cls): diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py index 70c3089c7025e..c14e1d74829ac 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py @@ -25,6 +25,11 @@ from pyspark.testing.sqlutils import SQLTestUtils +# This is used in run-tests.py to discover the slow test. See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +_slow_test = True + + class OpsOnDiffFramesGroupByTest(PandasOnSparkTestCase, SQLTestUtils): @classmethod def setUpClass(cls): diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py index 5fea71d6622bd..0ed2255440c01 100644 --- a/python/pyspark/pandas/tests/test_series.py +++ b/python/pyspark/pandas/tests/test_series.py @@ -44,6 +44,11 @@ ) +# This is used in run-tests.py to discover the slow test. See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +_slow_test = True + + class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): @property def pser(self): diff --git a/python/pyspark/pandas/tests/test_stats.py b/python/pyspark/pandas/tests/test_stats.py index 1a386654076e9..7048dcf949ff5 100644 --- a/python/pyspark/pandas/tests/test_stats.py +++ b/python/pyspark/pandas/tests/test_stats.py @@ -31,6 +31,11 @@ from pyspark.testing.sqlutils import SQLTestUtils +# This is used in run-tests.py to discover the slow test. See more in the doc of +# _discover_python_unittests of dev/sparktestsupport/modules.py +_slow_test = True + + class StatsTest(PandasOnSparkTestCase, SQLTestUtils): def _test_stat_functions(self, pdf_or_pser, psdf_or_psser): functions = ["max", "min", "mean", "sum", "count"] From ef460a70716b3a4b4c9835b8d7fd73f839500e24 Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Mon, 28 Jun 2021 15:10:10 +0800 Subject: [PATCH 7/8] add doctest and fix nits --- dev/run-tests.py | 10 ++++- dev/sparktestsupport/modules.py | 45 +++++++++---------- .../pyspark/pandas/tests/indexes/test_base.py | 2 +- .../pandas/tests/indexes/test_datetime.py | 2 +- python/pyspark/pandas/tests/test_dataframe.py | 2 +- python/pyspark/pandas/tests/test_groupby.py | 2 +- python/pyspark/pandas/tests/test_indexing.py | 2 +- .../pandas/tests/test_ops_on_diff_frames.py | 2 +- .../tests/test_ops_on_diff_frames_groupby.py | 2 +- python/pyspark/pandas/tests/test_series.py | 2 +- python/pyspark/pandas/tests/test_stats.py | 2 +- 11 files changed, 40 insertions(+), 33 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index c3b074533c9f8..a440754263e2d 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -19,6 +19,7 @@ import itertools from argparse import ArgumentParser +import doctest import os import re import sys @@ -774,12 +775,19 @@ def main(): def _test(): - import doctest failure_count = doctest.testmod()[0] if failure_count: sys.exit(-1) +def _modules_doctest(): + # Running doctests in sparktestsupport.modules + failure_count = doctest.testmod(m=modules)[0] + if failure_count: + sys.exit(-1) + + if __name__ == "__main__": _test() + _modules_doctest() main() diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 49b84716d07e2..a455d1b184249 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -15,6 +15,7 @@ # limitations under the License. # +from collections.abc import Iterable from functools import total_ordering import itertools import os @@ -50,34 +51,43 @@ def _discover_python_unittests(*paths, discover_slow=False): Returns ------- A set of complete test module name discovered under specified paths + + >>> sorted([x for x in _discover_python_unittests('pyspark/tests')]) + ... # doctest: +NORMALIZE_WHITESPACE + ['pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', 'pyspark.tests.test_conf', + 'pyspark.tests.test_context', 'pyspark.tests.test_daemon', 'pyspark.tests.test_install_spark', + 'pyspark.tests.test_join', 'pyspark.tests.test_pin_thread', 'pyspark.tests.test_profiler', + 'pyspark.tests.test_rdd', 'pyspark.tests.test_rddbarrier', 'pyspark.tests.test_readwrite', + 'pyspark.tests.test_serializers', 'pyspark.tests.test_shuffle', + 'pyspark.tests.test_taskcontext', 'pyspark.tests.test_util', 'pyspark.tests.test_worker'] """ - def add_suite(testcases, modules, slow): - """Gather the suite module names""" - if hasattr(testcases, '__iter__'): + def add_test_module(testcases, modules, slow): + """Append the testcases module names to modules set""" + if isinstance(testcases, Iterable): for test_case in testcases: - add_suite(test_case, modules, slow) + add_test_module(test_case, modules, slow) else: name = testcases.__module__ module = _get_module_from_name(name) - if slow and hasattr(module, '_slow_test'): + if slow and hasattr(module, 'is_slow_test'): modules.add(name) - if not slow and not hasattr(module, '_slow_test'): + if not slow and not hasattr(module, 'is_slow_test'): modules.add(name) if not paths: return [] - _modules = set() + modules = set() pyspark_path = os.path.join(SPARK_HOME, "python") for path in paths: # Discover the unittest in every path - suite = unittest.defaultTestLoader.discover( + testcases = unittest.defaultTestLoader.discover( os.path.join(pyspark_path, path), top_level_dir=pyspark_path ) - add_suite(suite, _modules, discover_slow) + add_test_module(testcases, modules, discover_slow) - return sorted(list(_modules)) + return sorted(list(modules)) @total_ordering @@ -594,12 +604,7 @@ def __hash__(self): "pyspark.pandas.spark.accessors", "pyspark.pandas.spark.utils", "pyspark.pandas.typedef.typehints", - ] + _discover_python_unittests( - "pyspark/pandas/tests", - "pyspark/pandas/tests/data_type_ops", - "pyspark/pandas/tests/indexes", - "pyspark/pandas/tests/plot" - ), + ] + _discover_python_unittests("pyspark/pandas/tests"), excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and # they aren't available there @@ -618,13 +623,7 @@ def __hash__(self): "pyspark.pandas.frame", "pyspark.pandas.generic", "pyspark.pandas.series", - ] + _discover_python_unittests( - "pyspark/pandas/tests", - "pyspark/pandas/tests/data_type_ops", - "pyspark/pandas/tests/indexes", - "pyspark/pandas/tests/plot", - discover_slow=True - ), + ] + _discover_python_unittests("pyspark/pandas/tests", discover_slow=True), excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and # they aren't available there diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py index c7667caea2ab5..2faad5effff0f 100644 --- a/python/pyspark/pandas/tests/indexes/test_base.py +++ b/python/pyspark/pandas/tests/indexes/test_base.py @@ -36,7 +36,7 @@ # This is used in run-tests.py to discover the slow test. See more in the doc of # _discover_python_unittests of dev/sparktestsupport/modules.py -_slow_test = True +is_slow_test = True class IndexesTest(PandasOnSparkTestCase, TestUtils): diff --git a/python/pyspark/pandas/tests/indexes/test_datetime.py b/python/pyspark/pandas/tests/indexes/test_datetime.py index e3e9b8f4cb885..79340127e3ebb 100644 --- a/python/pyspark/pandas/tests/indexes/test_datetime.py +++ b/python/pyspark/pandas/tests/indexes/test_datetime.py @@ -27,7 +27,7 @@ # This is used in run-tests.py to discover the slow test. See more in the doc of # _discover_python_unittests of dev/sparktestsupport/modules.py -_slow_test = True +is_slow_test = True class DatetimeIndexTest(PandasOnSparkTestCase, TestUtils): diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py index b42462262a982..858863cac4abf 100644 --- a/python/pyspark/pandas/tests/test_dataframe.py +++ b/python/pyspark/pandas/tests/test_dataframe.py @@ -52,7 +52,7 @@ # This is used in run-tests.py to discover the slow test. See more in the doc of # _discover_python_unittests of dev/sparktestsupport/modules.py -_slow_test = True +is_slow_test = True class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils): diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py index a88f2195943da..5ec93ca3d9772 100644 --- a/python/pyspark/pandas/tests/test_groupby.py +++ b/python/pyspark/pandas/tests/test_groupby.py @@ -36,7 +36,7 @@ # This is used in run-tests.py to discover the slow test. See more in the doc of # _discover_python_unittests of dev/sparktestsupport/modules.py -_slow_test = True +is_slow_test = True class GroupByTest(PandasOnSparkTestCase, TestUtils): diff --git a/python/pyspark/pandas/tests/test_indexing.py b/python/pyspark/pandas/tests/test_indexing.py index dc0e8abe00307..056d404bc161b 100644 --- a/python/pyspark/pandas/tests/test_indexing.py +++ b/python/pyspark/pandas/tests/test_indexing.py @@ -29,7 +29,7 @@ # This is used in run-tests.py to discover the slow test. See more in the doc of # _discover_python_unittests of dev/sparktestsupport/modules.py -_slow_test = True +is_slow_test = True class BasicIndexingTest(ComparisonTestBase): diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py index 5200a00d86240..db8beb7a23470 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py @@ -37,7 +37,7 @@ # This is used in run-tests.py to discover the slow test. See more in the doc of # _discover_python_unittests of dev/sparktestsupport/modules.py -_slow_test = True +is_slow_test = True class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils): diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py index c14e1d74829ac..97efcf82257b7 100644 --- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py +++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py @@ -27,7 +27,7 @@ # This is used in run-tests.py to discover the slow test. See more in the doc of # _discover_python_unittests of dev/sparktestsupport/modules.py -_slow_test = True +is_slow_test = True class OpsOnDiffFramesGroupByTest(PandasOnSparkTestCase, SQLTestUtils): diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py index 0ed2255440c01..d0aa66fd57c05 100644 --- a/python/pyspark/pandas/tests/test_series.py +++ b/python/pyspark/pandas/tests/test_series.py @@ -46,7 +46,7 @@ # This is used in run-tests.py to discover the slow test. See more in the doc of # _discover_python_unittests of dev/sparktestsupport/modules.py -_slow_test = True +is_slow_test = True class SeriesTest(PandasOnSparkTestCase, SQLTestUtils): diff --git a/python/pyspark/pandas/tests/test_stats.py b/python/pyspark/pandas/tests/test_stats.py index 7048dcf949ff5..06b35e042f3b0 100644 --- a/python/pyspark/pandas/tests/test_stats.py +++ b/python/pyspark/pandas/tests/test_stats.py @@ -33,7 +33,7 @@ # This is used in run-tests.py to discover the slow test. See more in the doc of # _discover_python_unittests of dev/sparktestsupport/modules.py -_slow_test = True +is_slow_test = True class StatsTest(PandasOnSparkTestCase, SQLTestUtils): From 64e0c0a6da6bc34d476b36a5c80d93a56571a7cd Mon Sep 17 00:00:00 2001 From: Yikun Jiang Date: Mon, 28 Jun 2021 21:53:21 +0800 Subject: [PATCH 8/8] Remove doctest --- dev/run-tests.py | 10 +--------- dev/sparktestsupport/modules.py | 9 --------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/dev/run-tests.py b/dev/run-tests.py index a440754263e2d..c3b074533c9f8 100755 --- a/dev/run-tests.py +++ b/dev/run-tests.py @@ -19,7 +19,6 @@ import itertools from argparse import ArgumentParser -import doctest import os import re import sys @@ -775,19 +774,12 @@ def main(): def _test(): + import doctest failure_count = doctest.testmod()[0] if failure_count: sys.exit(-1) -def _modules_doctest(): - # Running doctests in sparktestsupport.modules - failure_count = doctest.testmod(m=modules)[0] - if failure_count: - sys.exit(-1) - - if __name__ == "__main__": _test() - _modules_doctest() main() diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index a455d1b184249..ac02ae44c1ccd 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -51,15 +51,6 @@ def _discover_python_unittests(*paths, discover_slow=False): Returns ------- A set of complete test module name discovered under specified paths - - >>> sorted([x for x in _discover_python_unittests('pyspark/tests')]) - ... # doctest: +NORMALIZE_WHITESPACE - ['pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', 'pyspark.tests.test_conf', - 'pyspark.tests.test_context', 'pyspark.tests.test_daemon', 'pyspark.tests.test_install_spark', - 'pyspark.tests.test_join', 'pyspark.tests.test_pin_thread', 'pyspark.tests.test_profiler', - 'pyspark.tests.test_rdd', 'pyspark.tests.test_rddbarrier', 'pyspark.tests.test_readwrite', - 'pyspark.tests.test_serializers', 'pyspark.tests.test_shuffle', - 'pyspark.tests.test_taskcontext', 'pyspark.tests.test_util', 'pyspark.tests.test_worker'] """ def add_test_module(testcases, modules, slow):