From 5a7e6e23ceda0d093f552117e543025b54078154 Mon Sep 17 00:00:00 2001
From: Yikun Jiang <yikunkero@gmail.com>
Date: Thu, 10 Jun 2021 21:20:14 +0800
Subject: [PATCH 1/8] auto test

---
 dev/sparktestsupport/modules.py | 205 +++++++++++---------------------
 1 file changed, 72 insertions(+), 133 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 4c4a8f866b7c1..b2f5c346be8ab 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -19,10 +19,28 @@
 import itertools
 import os
 import re
+import glob
+
+from sparktestsupport import SPARK_HOME
 
 all_modules = []
 
 
+def _discover_python_unittests(paths):
+    if not paths:
+        return set([])
+    tests = set([])
+    pyspark_path = os.path.join(SPARK_HOME, "python")
+    for path in paths:
+        # Discover the test*.py in every path
+        files = glob.glob(os.path.join(pyspark_path, path, "test*.py"))
+        for f in files:
+            # Convert 'pyspark_path/pyspark/tests/test_abc.py' to 'pyspark.tests.test_abc'
+            file2module = os.path.relpath(f, pyspark_path)[:-3].replace("/", ".")
+            tests.add(file2module)
+    return tests
+
+
 @total_ordering
 class Module(object):
     """
@@ -35,7 +53,7 @@ class Module(object):
     def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(),
                  environ=None, sbt_test_goals=(), python_test_goals=(),
                  excluded_python_implementations=(), test_tags=(), should_run_r_tests=False,
-                 should_run_build_tests=False):
+                 should_run_build_tests=False, python_test_paths=(), python_excluded_tests=()):
         """
         Define a new module.
 
@@ -58,6 +76,8 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=
             is not explicitly changed.
         :param should_run_r_tests: If true, changes in this module will trigger all R tests.
         :param should_run_build_tests: If true, changes in this module will trigger build tests.
+        :param python_test_paths: A set of python test paths to be discovered
+        :param python_excluded_tests: A set of excluded Python tests
         """
         self.name = name
         self.dependencies = dependencies
@@ -65,7 +85,10 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=
         self.sbt_test_goals = sbt_test_goals
         self.build_profile_flags = build_profile_flags
         self.environ = environ or {}
-        self.python_test_goals = python_test_goals
+        discovered_goals = _discover_python_unittests(python_test_paths)
+        # Final goals = Manual specified goals + Discoverd goals - Excluded goals
+        all_goals = set(python_test_goals) | set(discovered_goals)
+        self.python_test_goals = sorted(list(set(all_goals) - set(python_excluded_tests)))
         self.excluded_python_implementations = excluded_python_implementations
         self.test_tags = test_tags
         self.should_run_r_tests = should_run_r_tests
@@ -382,24 +405,10 @@ def __hash__(self):
         "pyspark.profiler",
         "pyspark.shuffle",
         "pyspark.util",
-        # unittests
-        "pyspark.tests.test_appsubmit",
-        "pyspark.tests.test_broadcast",
-        "pyspark.tests.test_conf",
-        "pyspark.tests.test_context",
-        "pyspark.tests.test_daemon",
-        "pyspark.tests.test_install_spark",
-        "pyspark.tests.test_join",
-        "pyspark.tests.test_profiler",
-        "pyspark.tests.test_rdd",
-        "pyspark.tests.test_rddbarrier",
-        "pyspark.tests.test_readwrite",
-        "pyspark.tests.test_serializers",
-        "pyspark.tests.test_shuffle",
-        "pyspark.tests.test_taskcontext",
-        "pyspark.tests.test_util",
-        "pyspark.tests.test_worker",
-    ]
+    ],
+    python_test_paths=[
+        "pyspark/tests"
+    ],
 )
 
 pyspark_sql = Module(
@@ -431,32 +440,10 @@ def __hash__(self):
         "pyspark.sql.pandas.serializers",
         "pyspark.sql.pandas.typehints",
         "pyspark.sql.pandas.utils",
-        # unittests
-        "pyspark.sql.tests.test_arrow",
-        "pyspark.sql.tests.test_catalog",
-        "pyspark.sql.tests.test_column",
-        "pyspark.sql.tests.test_conf",
-        "pyspark.sql.tests.test_context",
-        "pyspark.sql.tests.test_dataframe",
-        "pyspark.sql.tests.test_datasources",
-        "pyspark.sql.tests.test_functions",
-        "pyspark.sql.tests.test_group",
-        "pyspark.sql.tests.test_pandas_cogrouped_map",
-        "pyspark.sql.tests.test_pandas_grouped_map",
-        "pyspark.sql.tests.test_pandas_map",
-        "pyspark.sql.tests.test_pandas_udf",
-        "pyspark.sql.tests.test_pandas_udf_grouped_agg",
-        "pyspark.sql.tests.test_pandas_udf_scalar",
-        "pyspark.sql.tests.test_pandas_udf_typehints",
-        "pyspark.sql.tests.test_pandas_udf_window",
-        "pyspark.sql.tests.test_readwriter",
-        "pyspark.sql.tests.test_serde",
-        "pyspark.sql.tests.test_session",
-        "pyspark.sql.tests.test_streaming",
-        "pyspark.sql.tests.test_types",
-        "pyspark.sql.tests.test_udf",
-        "pyspark.sql.tests.test_utils",
-    ]
+    ],
+    python_test_paths=[
+        "pyspark/sql/tests"
+    ],
 )
 
 
@@ -471,7 +458,10 @@ def __hash__(self):
     python_test_goals=[
         # unittests
         "pyspark.resource.tests.test_resources",
-    ]
+    ],
+    python_test_paths=[
+        "pyspark/resource/tests"
+    ],
 )
 
 
@@ -488,12 +478,10 @@ def __hash__(self):
     python_test_goals=[
         # doctests
         "pyspark.streaming.util",
-        # unittests
-        "pyspark.streaming.tests.test_context",
-        "pyspark.streaming.tests.test_dstream",
-        "pyspark.streaming.tests.test_kinesis",
-        "pyspark.streaming.tests.test_listener",
-    ]
+    ],
+    python_test_paths=[
+        "pyspark/streaming/tests"
+    ],
 )
 
 
@@ -519,17 +507,13 @@ def __hash__(self):
         "pyspark.mllib.stat.KernelDensity",
         "pyspark.mllib.tree",
         "pyspark.mllib.util",
-        # unittests
-        "pyspark.mllib.tests.test_algorithms",
-        "pyspark.mllib.tests.test_feature",
-        "pyspark.mllib.tests.test_linalg",
-        "pyspark.mllib.tests.test_stat",
-        "pyspark.mllib.tests.test_streaming_algorithms",
-        "pyspark.mllib.tests.test_util",
     ],
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
-    ]
+    ],
+    python_test_paths=[
+        "pyspark/mllib/tests"
+    ],
 )
 
 
@@ -553,27 +537,28 @@ def __hash__(self):
         "pyspark.ml.regression",
         "pyspark.ml.stat",
         "pyspark.ml.tuning",
-        # unittests
-        "pyspark.ml.tests.test_algorithms",
-        "pyspark.ml.tests.test_base",
-        "pyspark.ml.tests.test_evaluation",
-        "pyspark.ml.tests.test_feature",
-        "pyspark.ml.tests.test_image",
-        "pyspark.ml.tests.test_linalg",
-        "pyspark.ml.tests.test_param",
-        "pyspark.ml.tests.test_persistence",
-        "pyspark.ml.tests.test_pipeline",
-        "pyspark.ml.tests.test_stat",
-        "pyspark.ml.tests.test_training_summary",
-        "pyspark.ml.tests.test_tuning",
-        "pyspark.ml.tests.test_util",
-        "pyspark.ml.tests.test_wrapper",
     ],
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
-    ]
+    ],
+    python_test_paths=[
+        "pyspark/ml/tests"
+    ],
 )
 
+pyspark_pandas_slow_unittests = [
+    # unittests
+    "pyspark.pandas.tests.indexes.test_base",
+    "pyspark.pandas.tests.indexes.test_datetime",
+    "pyspark.pandas.tests.test_dataframe",
+    "pyspark.pandas.tests.test_groupby",
+    "pyspark.pandas.tests.test_indexing",
+    "pyspark.pandas.tests.test_ops_on_diff_frames",
+    "pyspark.pandas.tests.test_ops_on_diff_frames_groupby",
+    "pyspark.pandas.tests.test_series",
+    "pyspark.pandas.tests.test_stats",
+]
+
 pyspark_pandas = Module(
     name="pyspark-pandas",
     dependencies=[pyspark_core, pyspark_sql],
@@ -608,54 +593,18 @@ def __hash__(self):
         "pyspark.pandas.spark.accessors",
         "pyspark.pandas.spark.utils",
         "pyspark.pandas.typedef.typehints",
-        # unittests
-        "pyspark.pandas.tests.data_type_ops.test_binary_ops",
-        "pyspark.pandas.tests.data_type_ops.test_boolean_ops",
-        "pyspark.pandas.tests.data_type_ops.test_categorical_ops",
-        "pyspark.pandas.tests.data_type_ops.test_complex_ops",
-        "pyspark.pandas.tests.data_type_ops.test_date_ops",
-        "pyspark.pandas.tests.data_type_ops.test_datetime_ops",
-        "pyspark.pandas.tests.data_type_ops.test_null_ops",
-        "pyspark.pandas.tests.data_type_ops.test_num_ops",
-        "pyspark.pandas.tests.data_type_ops.test_string_ops",
-        "pyspark.pandas.tests.data_type_ops.test_udt_ops",
-        "pyspark.pandas.tests.indexes.test_category",
-        "pyspark.pandas.tests.plot.test_frame_plot",
-        "pyspark.pandas.tests.plot.test_frame_plot_matplotlib",
-        "pyspark.pandas.tests.plot.test_frame_plot_plotly",
-        "pyspark.pandas.tests.plot.test_series_plot",
-        "pyspark.pandas.tests.plot.test_series_plot_matplotlib",
-        "pyspark.pandas.tests.plot.test_series_plot_plotly",
-        "pyspark.pandas.tests.test_categorical",
-        "pyspark.pandas.tests.test_config",
-        "pyspark.pandas.tests.test_csv",
-        "pyspark.pandas.tests.test_dataframe_conversion",
-        "pyspark.pandas.tests.test_dataframe_spark_io",
-        "pyspark.pandas.tests.test_default_index",
-        "pyspark.pandas.tests.test_expanding",
-        "pyspark.pandas.tests.test_extension",
-        "pyspark.pandas.tests.test_frame_spark",
-        "pyspark.pandas.tests.test_indexops_spark",
-        "pyspark.pandas.tests.test_internal",
-        "pyspark.pandas.tests.test_namespace",
-        "pyspark.pandas.tests.test_numpy_compat",
-        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_expanding",
-        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby_rolling",
-        "pyspark.pandas.tests.test_repr",
-        "pyspark.pandas.tests.test_reshape",
-        "pyspark.pandas.tests.test_rolling",
-        "pyspark.pandas.tests.test_series_conversion",
-        "pyspark.pandas.tests.test_series_datetime",
-        "pyspark.pandas.tests.test_series_string",
-        "pyspark.pandas.tests.test_sql",
-        "pyspark.pandas.tests.test_typedef",
-        "pyspark.pandas.tests.test_utils",
-        "pyspark.pandas.tests.test_window",
     ],
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
-                # they aren't available there
-    ]
+        # they aren't available there
+    ],
+    python_test_paths=[
+        "pyspark/pandas/tests",
+        "pyspark/pandas/tests/data_type_ops",
+        "pyspark/pandas/tests/indexes",
+        "pyspark/pandas/tests/plot",
+    ],
+    python_excluded_tests=pyspark_pandas_slow_unittests,
 )
 
 pyspark_pandas_slow = Module(
@@ -669,17 +618,7 @@ def __hash__(self):
         "pyspark.pandas.frame",
         "pyspark.pandas.generic",
         "pyspark.pandas.series",
-        # unittests
-        "pyspark.pandas.tests.indexes.test_base",
-        "pyspark.pandas.tests.indexes.test_datetime",
-        "pyspark.pandas.tests.test_dataframe",
-        "pyspark.pandas.tests.test_groupby",
-        "pyspark.pandas.tests.test_indexing",
-        "pyspark.pandas.tests.test_ops_on_diff_frames",
-        "pyspark.pandas.tests.test_ops_on_diff_frames_groupby",
-        "pyspark.pandas.tests.test_series",
-        "pyspark.pandas.tests.test_stats",
-    ],
+    ] + pyspark_pandas_slow_unittests,
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
         # they aren't available there

From 23f2c87c7ee16d433b3d59dbb71dae00203fea97 Mon Sep 17 00:00:00 2001
From: Yikun Jiang <yikunkero@gmail.com>
Date: Tue, 15 Jun 2021 10:28:41 +0800
Subject: [PATCH 2/8] Change glob rule from test* to test_*

---
 dev/sparktestsupport/modules.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index b2f5c346be8ab..2b4d72f49b130 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -33,7 +33,7 @@ def _discover_python_unittests(paths):
     pyspark_path = os.path.join(SPARK_HOME, "python")
     for path in paths:
         # Discover the test*.py in every path
-        files = glob.glob(os.path.join(pyspark_path, path, "test*.py"))
+        files = glob.glob(os.path.join(pyspark_path, path, "test_*.py"))
         for f in files:
             # Convert 'pyspark_path/pyspark/tests/test_abc.py' to 'pyspark.tests.test_abc'
             file2module = os.path.relpath(f, pyspark_path)[:-3].replace("/", ".")

From 22015a526ae9fe66482d477fbc858ec6d73d18b4 Mon Sep 17 00:00:00 2001
From: Yikun Jiang <yikunkero@gmail.com>
Date: Wed, 16 Jun 2021 14:45:55 +0800
Subject: [PATCH 3/8] Change the 'test_* name discover' to 'unittest module
 discover'

---
 dev/sparktestsupport/modules.py | 54 ++++++++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 11 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 2b4d72f49b130..170c0e20389c6 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -16,28 +16,61 @@
 #
 
 from functools import total_ordering
+from importlib import import_module
+import inspect
 import itertools
 import os
+from pkgutil import iter_modules
 import re
-import glob
+import sys
+import unittest
 
 from sparktestsupport import SPARK_HOME
 
+
 all_modules = []
+pyspark_path = os.path.join(SPARK_HOME, "python")
+sys.path.append(pyspark_path)
+
+
+def _contain_unittests_class(module_name):
+    """
+    Check if the module with specific module_name has classes are derived from unittest.TestCase.
+    Such as:
+    pyspark.tests.test_appsubmit, it will return True, because there is SparkSubmitTests which is
+    included under the module of pyspark.tests.test_appsubmit, inherits from unittest.TestCase.
+    ``
+    :param module_name: the complete name of module to be checked.
+    :return: True if contains unittest classes otherwise False.
+             An ``ModuleNotFoundError`` will raise if the module is not found
+    """
+    _module = import_module(module_name)
+    for _, _class in inspect.getmembers(_module, inspect.isclass):
+        if issubclass(_class, unittest.TestCase):
+            return True
+    return False
 
 
 def _discover_python_unittests(paths):
+    """
+    Discover the python module which contains unittests under paths.
+    Such as:
+    ['pyspark/tests'], it will return the set of module name under the path of pyspark/tests, like
+    {'pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', ...}
+    :param paths: paths of module to be discovered.
+    :return: A set of complete test module name discovered udner the paths
+    """
     if not paths:
-        return set([])
-    tests = set([])
-    pyspark_path = os.path.join(SPARK_HOME, "python")
+        return set()
+    tests = set()
+
     for path in paths:
-        # Discover the test*.py in every path
-        files = glob.glob(os.path.join(pyspark_path, path, "test_*.py"))
-        for f in files:
-            # Convert 'pyspark_path/pyspark/tests/test_abc.py' to 'pyspark.tests.test_abc'
-            file2module = os.path.relpath(f, pyspark_path)[:-3].replace("/", ".")
-            tests.add(file2module)
+        real_path = os.path.join(pyspark_path, path)
+        _prefix = path.replace('/', '.')
+        # iter modules under the specific tests path
+        for module in iter_modules([real_path], prefix=_prefix+'.'):
+            if _contain_unittests_class(module.name):
+                tests.add(module.name)
     return tests
 
 
@@ -56,7 +89,6 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=
                  should_run_build_tests=False, python_test_paths=(), python_excluded_tests=()):
         """
         Define a new module.
-
         :param name: A short module name, for display in logging and error messages.
         :param dependencies: A set of dependencies for this module. This should only include direct
             dependencies; transitive dependencies are resolved automatically.

From 7cda24e3460cd9270fcdf3f866121ff9a69506dc Mon Sep 17 00:00:00 2001
From: Yikun Jiang <yikunkero@gmail.com>
Date: Wed, 16 Jun 2021 18:27:48 +0800
Subject: [PATCH 4/8] Revert to test_* discover

---
 dev/sparktestsupport/modules.py | 53 +++++++--------------------------
 1 file changed, 10 insertions(+), 43 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 170c0e20389c6..5b6bbf864dfa5 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -16,61 +16,28 @@
 #
 
 from functools import total_ordering
-from importlib import import_module
-import inspect
 import itertools
 import os
-from pkgutil import iter_modules
 import re
-import sys
-import unittest
+import glob
 
 from sparktestsupport import SPARK_HOME
 
-
 all_modules = []
-pyspark_path = os.path.join(SPARK_HOME, "python")
-sys.path.append(pyspark_path)
-
-
-def _contain_unittests_class(module_name):
-    """
-    Check if the module with specific module_name has classes are derived from unittest.TestCase.
-    Such as:
-    pyspark.tests.test_appsubmit, it will return True, because there is SparkSubmitTests which is
-    included under the module of pyspark.tests.test_appsubmit, inherits from unittest.TestCase.
-    ``
-    :param module_name: the complete name of module to be checked.
-    :return: True if contains unittest classes otherwise False.
-             An ``ModuleNotFoundError`` will raise if the module is not found
-    """
-    _module = import_module(module_name)
-    for _, _class in inspect.getmembers(_module, inspect.isclass):
-        if issubclass(_class, unittest.TestCase):
-            return True
-    return False
 
 
 def _discover_python_unittests(paths):
-    """
-    Discover the python module which contains unittests under paths.
-    Such as:
-    ['pyspark/tests'], it will return the set of module name under the path of pyspark/tests, like
-    {'pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', ...}
-    :param paths: paths of module to be discovered.
-    :return: A set of complete test module name discovered udner the paths
-    """
     if not paths:
-        return set()
-    tests = set()
-
+        return set([])
+    tests = set([])
+    pyspark_path = os.path.join(SPARK_HOME, "python")
     for path in paths:
-        real_path = os.path.join(pyspark_path, path)
-        _prefix = path.replace('/', '.')
-        # iter modules under the specific tests path
-        for module in iter_modules([real_path], prefix=_prefix+'.'):
-            if _contain_unittests_class(module.name):
-                tests.add(module.name)
+        # Discover the test*.py in every path
+        files = glob.glob(os.path.join(pyspark_path, path, "test_*.py"))
+        for f in files:
+            # Convert 'pyspark_path/pyspark/tests/test_abc.py' to 'pyspark.tests.test_abc'
+            file2module = os.path.relpath(f, pyspark_path)[:-3].replace("/", ".")
+            tests.add(file2module)
     return tests
 
 

From 14ee8761ed79dc8fca8d02f594c9f7167b9ed523 Mon Sep 17 00:00:00 2001
From: Yikun Jiang <yikunkero@gmail.com>
Date: Thu, 17 Jun 2021 11:20:47 +0800
Subject: [PATCH 5/8] Change 'test_* discover' to 'unittest test loader
 discover'

---
 dev/sparktestsupport/modules.py | 40 ++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 5b6bbf864dfa5..a5bc5d0c851ba 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -19,7 +19,7 @@
 import itertools
 import os
 import re
-import glob
+import unittest
 
 from sparktestsupport import SPARK_HOME
 
@@ -27,18 +27,38 @@
 
 
 def _discover_python_unittests(paths):
+    """
+    Discover the python module which contains unittests under paths.
+
+    Such as:
+    ['pyspark/tests'], it will return the set of module name under the path of pyspark/tests, like
+    {'pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', ...}
+
+    :param paths: paths of module to be discovered.
+    :return: A set of complete test module name discovered udner the paths
+    """
+
+    def add_suite(suite, modules):
+        """Gather the suite module names"""
+        if hasattr(suite, '__iter__'):
+            for test_case in suite:
+                add_suite(test_case, modules)
+        else:
+            modules.add(suite.__module__)
+
     if not paths:
         return set([])
-    tests = set([])
+    modules = set([])
     pyspark_path = os.path.join(SPARK_HOME, "python")
     for path in paths:
-        # Discover the test*.py in every path
-        files = glob.glob(os.path.join(pyspark_path, path, "test_*.py"))
-        for f in files:
-            # Convert 'pyspark_path/pyspark/tests/test_abc.py' to 'pyspark.tests.test_abc'
-            file2module = os.path.relpath(f, pyspark_path)[:-3].replace("/", ".")
-            tests.add(file2module)
-    return tests
+        # Discover the unittest in every path
+        suite = unittest.defaultTestLoader.discover(
+            os.path.join(pyspark_path, path),
+            top_level_dir=pyspark_path
+        )
+        add_suite(suite, modules)
+
+    return modules
 
 
 @total_ordering
@@ -455,8 +475,6 @@ def __hash__(self):
         "python/pyspark/resource"
     ],
     python_test_goals=[
-        # unittests
-        "pyspark.resource.tests.test_resources",
     ],
     python_test_paths=[
         "pyspark/resource/tests"

From b5871b06f23bc9abb4f83163bafc2f8e88101def Mon Sep 17 00:00:00 2001
From: Yikun Jiang <yikunkero@gmail.com>
Date: Fri, 18 Jun 2021 23:19:22 +0800
Subject: [PATCH 6/8] Add _slow_test flag and address nits

---
 dev/sparktestsupport/modules.py               | 123 ++++++++----------
 .../pyspark/pandas/tests/indexes/test_base.py |   5 +
 .../pandas/tests/indexes/test_datetime.py     |   5 +
 python/pyspark/pandas/tests/test_dataframe.py |   5 +
 python/pyspark/pandas/tests/test_groupby.py   |   5 +
 python/pyspark/pandas/tests/test_indexing.py  |   5 +
 .../pandas/tests/test_ops_on_diff_frames.py   |   5 +
 .../tests/test_ops_on_diff_frames_groupby.py  |   5 +
 python/pyspark/pandas/tests/test_series.py    |   5 +
 python/pyspark/pandas/tests/test_stats.py     |   5 +
 10 files changed, 101 insertions(+), 67 deletions(-)

diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index a5bc5d0c851ba..49b84716d07e2 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -20,35 +20,54 @@
 import os
 import re
 import unittest
+import sys
 
 from sparktestsupport import SPARK_HOME
 
 all_modules = []
 
 
-def _discover_python_unittests(paths):
-    """
-    Discover the python module which contains unittests under paths.
+def _get_module_from_name(name):
+    __import__(name)
+    return sys.modules[name]
+
+
+def _discover_python_unittests(*paths, discover_slow=False):
+    """Discover the python module which contains unittests under paths.
 
     Such as:
     ['pyspark/tests'], it will return the set of module name under the path of pyspark/tests, like
     {'pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', ...}
 
-    :param paths: paths of module to be discovered.
-    :return: A set of complete test module name discovered udner the paths
+    Parameters
+    ----------
+    paths : str
+        Paths of modules to be discovered.
+    discover_slow : bool
+        If True, will only discover slow tests
+        If False, will discover all tests except slow tests
+
+    Returns
+    -------
+    A sorted list of the complete test module names discovered under the specified paths
     """
 
-    def add_suite(suite, modules):
+    def add_suite(testcases, modules, slow):
         """Gather the suite module names"""
-        if hasattr(suite, '__iter__'):
-            for test_case in suite:
-                add_suite(test_case, modules)
+        if hasattr(testcases, '__iter__'):
+            for test_case in testcases:
+                add_suite(test_case, modules, slow)
         else:
-            modules.add(suite.__module__)
+            name = testcases.__module__
+            module = _get_module_from_name(name)
+            if slow and hasattr(module, '_slow_test'):
+                modules.add(name)
+            if not slow and not hasattr(module, '_slow_test'):
+                modules.add(name)
 
     if not paths:
-        return set([])
-    modules = set([])
+        return []
+    _modules = set()
     pyspark_path = os.path.join(SPARK_HOME, "python")
     for path in paths:
         # Discover the unittest in every path
@@ -56,9 +75,9 @@ def add_suite(suite, modules):
             os.path.join(pyspark_path, path),
             top_level_dir=pyspark_path
         )
-        add_suite(suite, modules)
+        add_suite(suite, _modules, discover_slow)
 
-    return modules
+    return sorted(list(_modules))
 
 
 @total_ordering
@@ -73,9 +92,10 @@ class Module(object):
     def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=(),
                  environ=None, sbt_test_goals=(), python_test_goals=(),
                  excluded_python_implementations=(), test_tags=(), should_run_r_tests=False,
-                 should_run_build_tests=False, python_test_paths=(), python_excluded_tests=()):
+                 should_run_build_tests=False):
         """
         Define a new module.
+
         :param name: A short module name, for display in logging and error messages.
         :param dependencies: A set of dependencies for this module. This should only include direct
             dependencies; transitive dependencies are resolved automatically.
@@ -95,8 +115,6 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=
             is not explicitly changed.
         :param should_run_r_tests: If true, changes in this module will trigger all R tests.
         :param should_run_build_tests: If true, changes in this module will trigger build tests.
-        :param python_test_paths: A set of python test paths to be discovered
-        :param python_excluded_tests: A set of excluded Python tests
         """
         self.name = name
         self.dependencies = dependencies
@@ -104,10 +122,7 @@ def __init__(self, name, dependencies, source_file_regexes, build_profile_flags=
         self.sbt_test_goals = sbt_test_goals
         self.build_profile_flags = build_profile_flags
         self.environ = environ or {}
-        discovered_goals = _discover_python_unittests(python_test_paths)
-        # Final goals = Manual specified goals + Discoverd goals - Excluded goals
-        all_goals = set(python_test_goals) | set(discovered_goals)
-        self.python_test_goals = sorted(list(set(all_goals) - set(python_excluded_tests)))
+        self.python_test_goals = python_test_goals
         self.excluded_python_implementations = excluded_python_implementations
         self.test_tags = test_tags
         self.should_run_r_tests = should_run_r_tests
@@ -424,10 +439,7 @@ def __hash__(self):
         "pyspark.profiler",
         "pyspark.shuffle",
         "pyspark.util",
-    ],
-    python_test_paths=[
-        "pyspark/tests"
-    ],
+    ] + _discover_python_unittests("pyspark/tests"),
 )
 
 pyspark_sql = Module(
@@ -459,10 +471,7 @@ def __hash__(self):
         "pyspark.sql.pandas.serializers",
         "pyspark.sql.pandas.typehints",
         "pyspark.sql.pandas.utils",
-    ],
-    python_test_paths=[
-        "pyspark/sql/tests"
-    ],
+    ] + _discover_python_unittests("pyspark/sql/tests"),
 )
 
 
@@ -474,11 +483,7 @@ def __hash__(self):
     source_file_regexes=[
         "python/pyspark/resource"
     ],
-    python_test_goals=[
-    ],
-    python_test_paths=[
-        "pyspark/resource/tests"
-    ],
+    python_test_goals=_discover_python_unittests("pyspark/resource/tests"),
 )
 
 
@@ -495,10 +500,7 @@ def __hash__(self):
     python_test_goals=[
         # doctests
         "pyspark.streaming.util",
-    ],
-    python_test_paths=[
-        "pyspark/streaming/tests"
-    ],
+    ] + _discover_python_unittests("pyspark/streaming/tests"),
 )
 
 
@@ -524,13 +526,10 @@ def __hash__(self):
         "pyspark.mllib.stat.KernelDensity",
         "pyspark.mllib.tree",
         "pyspark.mllib.util",
-    ],
+    ] + _discover_python_unittests("pyspark/mllib/tests"),
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
     ],
-    python_test_paths=[
-        "pyspark/mllib/tests"
-    ],
 )
 
 
@@ -554,27 +553,12 @@ def __hash__(self):
         "pyspark.ml.regression",
         "pyspark.ml.stat",
         "pyspark.ml.tuning",
-    ],
+    ] + _discover_python_unittests("pyspark/ml/tests"),
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy and it isn't available there
     ],
-    python_test_paths=[
-        "pyspark/ml/tests"
-    ],
 )
 
-pyspark_pandas_slow_unittests = [
-    # unittests
-    "pyspark.pandas.tests.indexes.test_base",
-    "pyspark.pandas.tests.indexes.test_datetime",
-    "pyspark.pandas.tests.test_dataframe",
-    "pyspark.pandas.tests.test_groupby",
-    "pyspark.pandas.tests.test_indexing",
-    "pyspark.pandas.tests.test_ops_on_diff_frames",
-    "pyspark.pandas.tests.test_ops_on_diff_frames_groupby",
-    "pyspark.pandas.tests.test_series",
-    "pyspark.pandas.tests.test_stats",
-]
 
 pyspark_pandas = Module(
     name="pyspark-pandas",
@@ -610,20 +594,19 @@ def __hash__(self):
         "pyspark.pandas.spark.accessors",
         "pyspark.pandas.spark.utils",
         "pyspark.pandas.typedef.typehints",
-    ],
-    excluded_python_implementations=[
-        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
-        # they aren't available there
-    ],
-    python_test_paths=[
+    ] + _discover_python_unittests(
         "pyspark/pandas/tests",
         "pyspark/pandas/tests/data_type_ops",
         "pyspark/pandas/tests/indexes",
-        "pyspark/pandas/tests/plot",
+        "pyspark/pandas/tests/plot"
+    ),
+    excluded_python_implementations=[
+        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
+        # they aren't available there
     ],
-    python_excluded_tests=pyspark_pandas_slow_unittests,
 )
 
+
 pyspark_pandas_slow = Module(
     name="pyspark-pandas-slow",
     dependencies=[pyspark_core, pyspark_sql],
@@ -635,7 +618,13 @@ def __hash__(self):
         "pyspark.pandas.frame",
         "pyspark.pandas.generic",
         "pyspark.pandas.series",
-    ] + pyspark_pandas_slow_unittests,
+    ] + _discover_python_unittests(
+        "pyspark/pandas/tests",
+        "pyspark/pandas/tests/data_type_ops",
+        "pyspark/pandas/tests/indexes",
+        "pyspark/pandas/tests/plot",
+        discover_slow=True
+    ),
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
         # they aren't available there
diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py
index d7c0ace911b35..9dc10ecb8b64f 100644
--- a/python/pyspark/pandas/tests/indexes/test_base.py
+++ b/python/pyspark/pandas/tests/indexes/test_base.py
@@ -34,6 +34,11 @@
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils, SPARK_CONF_ARROW_ENABLED
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+_slow_test = True
+
+
 class IndexesTest(PandasOnSparkTestCase, TestUtils):
     @property
     def pdf(self):
diff --git a/python/pyspark/pandas/tests/indexes/test_datetime.py b/python/pyspark/pandas/tests/indexes/test_datetime.py
index 8a55e2efefdb1..e3e9b8f4cb885 100644
--- a/python/pyspark/pandas/tests/indexes/test_datetime.py
+++ b/python/pyspark/pandas/tests/indexes/test_datetime.py
@@ -25,6 +25,11 @@
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+_slow_test = True
+
+
 class DatetimeIndexTest(PandasOnSparkTestCase, TestUtils):
     @property
     def fixed_freqs(self):
diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py
index e54b7835c22ca..b42462262a982 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -50,6 +50,11 @@
 from pyspark.pandas.utils import name_like_string
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+_slow_test = True
+
+
 class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
     @property
     def pdf(self):
diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py
index 1bc182d8462f0..a88f2195943da 100644
--- a/python/pyspark/pandas/tests/test_groupby.py
+++ b/python/pyspark/pandas/tests/test_groupby.py
@@ -34,6 +34,11 @@
 from pyspark.testing.pandasutils import PandasOnSparkTestCase, TestUtils
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+_slow_test = True
+
+
 class GroupByTest(PandasOnSparkTestCase, TestUtils):
     def test_groupby_simple(self):
         pdf = pd.DataFrame(
diff --git a/python/pyspark/pandas/tests/test_indexing.py b/python/pyspark/pandas/tests/test_indexing.py
index b74cf90d079f9..dc0e8abe00307 100644
--- a/python/pyspark/pandas/tests/test_indexing.py
+++ b/python/pyspark/pandas/tests/test_indexing.py
@@ -27,6 +27,11 @@
 from pyspark.testing.pandasutils import ComparisonTestBase, PandasOnSparkTestCase, compare_both
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+_slow_test = True
+
+
 class BasicIndexingTest(ComparisonTestBase):
     @property
     def pdf(self):
diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
index 12e87b202e699..5200a00d86240 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
@@ -35,6 +35,11 @@
 )
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+_slow_test = True
+
+
 class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
     @classmethod
     def setUpClass(cls):
diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
index 70c3089c7025e..c14e1d74829ac 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
@@ -25,6 +25,11 @@
 from pyspark.testing.sqlutils import SQLTestUtils
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+_slow_test = True
+
+
 class OpsOnDiffFramesGroupByTest(PandasOnSparkTestCase, SQLTestUtils):
     @classmethod
     def setUpClass(cls):
diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py
index 5fea71d6622bd..0ed2255440c01 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -44,6 +44,11 @@
 )
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+_slow_test = True
+
+
 class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
     @property
     def pser(self):
diff --git a/python/pyspark/pandas/tests/test_stats.py b/python/pyspark/pandas/tests/test_stats.py
index 1a386654076e9..7048dcf949ff5 100644
--- a/python/pyspark/pandas/tests/test_stats.py
+++ b/python/pyspark/pandas/tests/test_stats.py
@@ -31,6 +31,11 @@
 from pyspark.testing.sqlutils import SQLTestUtils
 
 
+# This is used in run-tests.py to discover the slow test. See more in the doc of
+# _discover_python_unittests of dev/sparktestsupport/modules.py
+_slow_test = True
+
+
 class StatsTest(PandasOnSparkTestCase, SQLTestUtils):
     def _test_stat_functions(self, pdf_or_pser, psdf_or_psser):
         functions = ["max", "min", "mean", "sum", "count"]

From ef460a70716b3a4b4c9835b8d7fd73f839500e24 Mon Sep 17 00:00:00 2001
From: Yikun Jiang <yikunkero@gmail.com>
Date: Mon, 28 Jun 2021 15:10:10 +0800
Subject: [PATCH 7/8] add doctest and fix nits

---
 dev/run-tests.py                              | 10 ++++-
 dev/sparktestsupport/modules.py               | 45 +++++++++----------
 .../pyspark/pandas/tests/indexes/test_base.py |  2 +-
 .../pandas/tests/indexes/test_datetime.py     |  2 +-
 python/pyspark/pandas/tests/test_dataframe.py |  2 +-
 python/pyspark/pandas/tests/test_groupby.py   |  2 +-
 python/pyspark/pandas/tests/test_indexing.py  |  2 +-
 .../pandas/tests/test_ops_on_diff_frames.py   |  2 +-
 .../tests/test_ops_on_diff_frames_groupby.py  |  2 +-
 python/pyspark/pandas/tests/test_series.py    |  2 +-
 python/pyspark/pandas/tests/test_stats.py     |  2 +-
 11 files changed, 40 insertions(+), 33 deletions(-)

diff --git a/dev/run-tests.py b/dev/run-tests.py
index c3b074533c9f8..a440754263e2d 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -19,6 +19,7 @@
 
 import itertools
 from argparse import ArgumentParser
+import doctest
 import os
 import re
 import sys
@@ -774,12 +775,19 @@ def main():
 
 
 def _test():
-    import doctest
     failure_count = doctest.testmod()[0]
     if failure_count:
         sys.exit(-1)
 
 
+def _modules_doctest():
+    # Running doctests in sparktestsupport.modules
+    failure_count = doctest.testmod(m=modules)[0]
+    if failure_count:
+        sys.exit(-1)
+
+
 if __name__ == "__main__":
     _test()
+    _modules_doctest()
     main()
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 49b84716d07e2..a455d1b184249 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -15,6 +15,7 @@
 # limitations under the License.
 #
 
+from collections.abc import Iterable
 from functools import total_ordering
 import itertools
 import os
@@ -50,34 +51,43 @@ def _discover_python_unittests(*paths, discover_slow=False):
     Returns
     -------
     A set of complete test module name discovered under specified paths
+
+    >>> sorted([x for x in _discover_python_unittests('pyspark/tests')])
+    ... # doctest: +NORMALIZE_WHITESPACE
+    ['pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', 'pyspark.tests.test_conf',
+    'pyspark.tests.test_context', 'pyspark.tests.test_daemon', 'pyspark.tests.test_install_spark',
+    'pyspark.tests.test_join', 'pyspark.tests.test_pin_thread', 'pyspark.tests.test_profiler',
+    'pyspark.tests.test_rdd', 'pyspark.tests.test_rddbarrier', 'pyspark.tests.test_readwrite',
+    'pyspark.tests.test_serializers', 'pyspark.tests.test_shuffle',
+    'pyspark.tests.test_taskcontext', 'pyspark.tests.test_util', 'pyspark.tests.test_worker']
     """
 
-    def add_suite(testcases, modules, slow):
-        """Gather the suite module names"""
-        if hasattr(testcases, '__iter__'):
+    def add_test_module(testcases, modules, slow):
+        """Add the module name of each test case matching the slow flag to the modules set"""
+        if isinstance(testcases, Iterable):
             for test_case in testcases:
-                add_suite(test_case, modules, slow)
+                add_test_module(test_case, modules, slow)
         else:
             name = testcases.__module__
             module = _get_module_from_name(name)
-            if slow and hasattr(module, '_slow_test'):
+            if slow and hasattr(module, 'is_slow_test'):
                 modules.add(name)
-            if not slow and not hasattr(module, '_slow_test'):
+            if not slow and not hasattr(module, 'is_slow_test'):
                 modules.add(name)
 
     if not paths:
         return []
-    _modules = set()
+    modules = set()
     pyspark_path = os.path.join(SPARK_HOME, "python")
     for path in paths:
         # Discover the unittest in every path
-        suite = unittest.defaultTestLoader.discover(
+        testcases = unittest.defaultTestLoader.discover(
             os.path.join(pyspark_path, path),
             top_level_dir=pyspark_path
         )
-        add_suite(suite, _modules, discover_slow)
+        add_test_module(testcases, modules, discover_slow)
 
-    return sorted(list(_modules))
+    return sorted(list(modules))
 
 
 @total_ordering
@@ -594,12 +604,7 @@ def __hash__(self):
         "pyspark.pandas.spark.accessors",
         "pyspark.pandas.spark.utils",
         "pyspark.pandas.typedef.typehints",
-    ] + _discover_python_unittests(
-        "pyspark/pandas/tests",
-        "pyspark/pandas/tests/data_type_ops",
-        "pyspark/pandas/tests/indexes",
-        "pyspark/pandas/tests/plot"
-    ),
+    ] + _discover_python_unittests("pyspark/pandas/tests"),
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
         # they aren't available there
@@ -618,13 +623,7 @@ def __hash__(self):
         "pyspark.pandas.frame",
         "pyspark.pandas.generic",
         "pyspark.pandas.series",
-    ] + _discover_python_unittests(
-        "pyspark/pandas/tests",
-        "pyspark/pandas/tests/data_type_ops",
-        "pyspark/pandas/tests/indexes",
-        "pyspark/pandas/tests/plot",
-        discover_slow=True
-    ),
+    ] + _discover_python_unittests("pyspark/pandas/tests", discover_slow=True),
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
         # they aren't available there
diff --git a/python/pyspark/pandas/tests/indexes/test_base.py b/python/pyspark/pandas/tests/indexes/test_base.py
index c7667caea2ab5..2faad5effff0f 100644
--- a/python/pyspark/pandas/tests/indexes/test_base.py
+++ b/python/pyspark/pandas/tests/indexes/test_base.py
@@ -36,7 +36,7 @@
 
 # This is used in run-tests.py to discover the slow test. See more in the doc of
 # _discover_python_unittests of dev/sparktestsupport/modules.py
-_slow_test = True
+is_slow_test = True
 
 
 class IndexesTest(PandasOnSparkTestCase, TestUtils):
diff --git a/python/pyspark/pandas/tests/indexes/test_datetime.py b/python/pyspark/pandas/tests/indexes/test_datetime.py
index e3e9b8f4cb885..79340127e3ebb 100644
--- a/python/pyspark/pandas/tests/indexes/test_datetime.py
+++ b/python/pyspark/pandas/tests/indexes/test_datetime.py
@@ -27,7 +27,7 @@
 
 # This is used in run-tests.py to discover the slow test. See more in the doc of
 # _discover_python_unittests of dev/sparktestsupport/modules.py
-_slow_test = True
+is_slow_test = True
 
 
 class DatetimeIndexTest(PandasOnSparkTestCase, TestUtils):
diff --git a/python/pyspark/pandas/tests/test_dataframe.py b/python/pyspark/pandas/tests/test_dataframe.py
index b42462262a982..858863cac4abf 100644
--- a/python/pyspark/pandas/tests/test_dataframe.py
+++ b/python/pyspark/pandas/tests/test_dataframe.py
@@ -52,7 +52,7 @@
 
 # This is used in run-tests.py to discover the slow test. See more in the doc of
 # _discover_python_unittests of dev/sparktestsupport/modules.py
-_slow_test = True
+is_slow_test = True
 
 
 class DataFrameTest(PandasOnSparkTestCase, SQLTestUtils):
diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py
index a88f2195943da..5ec93ca3d9772 100644
--- a/python/pyspark/pandas/tests/test_groupby.py
+++ b/python/pyspark/pandas/tests/test_groupby.py
@@ -36,7 +36,7 @@
 
 # This is used in run-tests.py to discover the slow test. See more in the doc of
 # _discover_python_unittests of dev/sparktestsupport/modules.py
-_slow_test = True
+is_slow_test = True
 
 
 class GroupByTest(PandasOnSparkTestCase, TestUtils):
diff --git a/python/pyspark/pandas/tests/test_indexing.py b/python/pyspark/pandas/tests/test_indexing.py
index dc0e8abe00307..056d404bc161b 100644
--- a/python/pyspark/pandas/tests/test_indexing.py
+++ b/python/pyspark/pandas/tests/test_indexing.py
@@ -29,7 +29,7 @@
 
 # This is used in run-tests.py to discover the slow test. See more in the doc of
 # _discover_python_unittests of dev/sparktestsupport/modules.py
-_slow_test = True
+is_slow_test = True
 
 
 class BasicIndexingTest(ComparisonTestBase):
diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
index 5200a00d86240..db8beb7a23470 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames.py
@@ -37,7 +37,7 @@
 
 # This is used in run-tests.py to discover the slow test. See more in the doc of
 # _discover_python_unittests of dev/sparktestsupport/modules.py
-_slow_test = True
+is_slow_test = True
 
 
 class OpsOnDiffFramesEnabledTest(PandasOnSparkTestCase, SQLTestUtils):
diff --git a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
index c14e1d74829ac..97efcf82257b7 100644
--- a/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
+++ b/python/pyspark/pandas/tests/test_ops_on_diff_frames_groupby.py
@@ -27,7 +27,7 @@
 
 # This is used in run-tests.py to discover the slow test. See more in the doc of
 # _discover_python_unittests of dev/sparktestsupport/modules.py
-_slow_test = True
+is_slow_test = True
 
 
 class OpsOnDiffFramesGroupByTest(PandasOnSparkTestCase, SQLTestUtils):
diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py
index 0ed2255440c01..d0aa66fd57c05 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -46,7 +46,7 @@
 
 # This is used in run-tests.py to discover the slow test. See more in the doc of
 # _discover_python_unittests of dev/sparktestsupport/modules.py
-_slow_test = True
+is_slow_test = True
 
 
 class SeriesTest(PandasOnSparkTestCase, SQLTestUtils):
diff --git a/python/pyspark/pandas/tests/test_stats.py b/python/pyspark/pandas/tests/test_stats.py
index 7048dcf949ff5..06b35e042f3b0 100644
--- a/python/pyspark/pandas/tests/test_stats.py
+++ b/python/pyspark/pandas/tests/test_stats.py
@@ -33,7 +33,7 @@
 
 # This is used in run-tests.py to discover the slow test. See more in the doc of
 # _discover_python_unittests of dev/sparktestsupport/modules.py
-_slow_test = True
+is_slow_test = True
 
 
 class StatsTest(PandasOnSparkTestCase, SQLTestUtils):

From 64e0c0a6da6bc34d476b36a5c80d93a56571a7cd Mon Sep 17 00:00:00 2001
From: Yikun Jiang <yikunkero@gmail.com>
Date: Mon, 28 Jun 2021 21:53:21 +0800
Subject: [PATCH 8/8] Remove doctest

---
 dev/run-tests.py                | 10 +---------
 dev/sparktestsupport/modules.py |  9 ---------
 2 files changed, 1 insertion(+), 18 deletions(-)

diff --git a/dev/run-tests.py b/dev/run-tests.py
index a440754263e2d..c3b074533c9f8 100755
--- a/dev/run-tests.py
+++ b/dev/run-tests.py
@@ -19,7 +19,6 @@
 
 import itertools
 from argparse import ArgumentParser
-import doctest
 import os
 import re
 import sys
@@ -775,19 +774,12 @@ def main():
 
 
 def _test():
+    import doctest
     failure_count = doctest.testmod()[0]
     if failure_count:
         sys.exit(-1)
 
 
-def _modules_doctest():
-    # Running doctests in sparktestsupport.modules
-    failure_count = doctest.testmod(m=modules)[0]
-    if failure_count:
-        sys.exit(-1)
-
-
 if __name__ == "__main__":
     _test()
-    _modules_doctest()
     main()
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index a455d1b184249..ac02ae44c1ccd 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -51,15 +51,6 @@ def _discover_python_unittests(*paths, discover_slow=False):
     Returns
     -------
     A set of complete test module name discovered under specified paths
-
-    >>> sorted([x for x in _discover_python_unittests('pyspark/tests')])
-    ... # doctest: +NORMALIZE_WHITESPACE
-    ['pyspark.tests.test_appsubmit', 'pyspark.tests.test_broadcast', 'pyspark.tests.test_conf',
-    'pyspark.tests.test_context', 'pyspark.tests.test_daemon', 'pyspark.tests.test_install_spark',
-    'pyspark.tests.test_join', 'pyspark.tests.test_pin_thread', 'pyspark.tests.test_profiler',
-    'pyspark.tests.test_rdd', 'pyspark.tests.test_rddbarrier', 'pyspark.tests.test_readwrite',
-    'pyspark.tests.test_serializers', 'pyspark.tests.test_shuffle',
-    'pyspark.tests.test_taskcontext', 'pyspark.tests.test_util', 'pyspark.tests.test_worker']
     """
 
     def add_test_module(testcases, modules, slow):