From 21e224a1d94c65814e553935f88ca65cb3a2ee69 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 3 Jan 2017 16:33:51 -0800 Subject: [PATCH 1/2] Fix pip installation of sub-components, and update the test and make-distribution scripts to be more explicit about cleanup. Also add pypandoc to the dev requirements file since we want it for publishing. --- dev/make-distribution.sh | 2 ++ dev/pip-sanity-check.py | 1 + dev/requirements.txt | 1 + dev/run-pip-tests | 6 ++++-- python/setup.py | 3 +++ 5 files changed, 11 insertions(+), 2 deletions(-) diff --git a/dev/make-distribution.sh b/dev/make-distribution.sh index 6c5ae0d6290ea..c39b3b38168c8 100755 --- a/dev/make-distribution.sh +++ b/dev/make-distribution.sh @@ -220,6 +220,8 @@ cp -r "$SPARK_HOME/data" "$DISTDIR" if [ "$MAKE_PIP" == "true" ]; then echo "Building python distribution package" pushd "$SPARK_HOME/python" > /dev/null + # Delete the egg info file if it exists, this can cache older setup files. + rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion" python setup.py sdist popd > /dev/null else diff --git a/dev/pip-sanity-check.py b/dev/pip-sanity-check.py index 430c2ab52766a..0d4f65ac27217 100644 --- a/dev/pip-sanity-check.py +++ b/dev/pip-sanity-check.py @@ -18,6 +18,7 @@ from __future__ import print_function from pyspark.sql import SparkSession +from pyspark.ml.param import Params import sys if __name__ == "__main__": diff --git a/dev/requirements.txt b/dev/requirements.txt index bf042d22a8b47..79782279f8fbd 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -1,3 +1,4 @@ jira==1.0.3 PyGithub==1.26.0 Unidecode==0.04.19 +pypandoc==1.3.3 diff --git a/dev/run-pip-tests b/dev/run-pip-tests index e1da18e60bb3d..9cb37e1e7829c 100755 --- a/dev/run-pip-tests +++ b/dev/run-pip-tests @@ -78,11 +78,13 @@ for python in "${PYTHON_EXECS[@]}"; do mkdir -p "$VIRTUALENV_PATH" virtualenv --python=$python "$VIRTUALENV_PATH" source "$VIRTUALENV_PATH"/bin/activate - # Upgrade pip - pip install 
--upgrade pip + # Upgrade pip & friends + pip install --upgrade pip pypandoc wheel echo "Creating pip installable source dist" cd "$FWDIR"/python + # Delete the egg info file if it exists, this can cache the setup file. + rm -rf pyspark.egg-info || echo "No existing egg info file, skipping deletion" $python setup.py sdist diff --git a/python/setup.py b/python/setup.py index bc2eb4ce9dbd0..58f50437f1627 100644 --- a/python/setup.py +++ b/python/setup.py @@ -163,6 +163,9 @@ def _supports_symlinks(): packages=['pyspark', 'pyspark.mllib', 'pyspark.ml', + 'pyspark.ml.param', + 'pyspark.ml.stat', + 'pyspark.ml.linalg', 'pyspark.sql', 'pyspark.streaming', 'pyspark.bin', From b28d9ca5e553e453b34d6199549d845ff5b6e1e2 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Tue, 3 Jan 2017 17:06:24 -0800 Subject: [PATCH 2/2] Install numpy so we can check the mllib linalg imports --- dev/pip-sanity-check.py | 1 + dev/run-pip-tests | 1 + python/setup.py | 4 +++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/dev/pip-sanity-check.py b/dev/pip-sanity-check.py index 0d4f65ac27217..c491005f49719 100644 --- a/dev/pip-sanity-check.py +++ b/dev/pip-sanity-check.py @@ -19,6 +19,7 @@ from pyspark.sql import SparkSession from pyspark.ml.param import Params +from pyspark.mllib.linalg import * import sys if __name__ == "__main__": diff --git a/dev/run-pip-tests b/dev/run-pip-tests index 9cb37e1e7829c..af1b1feb70cd1 100755 --- a/dev/run-pip-tests +++ b/dev/run-pip-tests @@ -80,6 +80,7 @@ for python in "${PYTHON_EXECS[@]}"; do source "$VIRTUALENV_PATH"/bin/activate # Upgrade pip & friends pip install --upgrade pip pypandoc wheel + pip install numpy # Needed so we can verify mllib imports echo "Creating pip installable source dist" cd "$FWDIR"/python diff --git a/python/setup.py b/python/setup.py index 58f50437f1627..47eab98e0f7b3 100644 --- a/python/setup.py +++ b/python/setup.py @@ -162,10 +162,12 @@ def _supports_symlinks(): url='https://github.com/apache/spark/tree/master/python', 
packages=['pyspark', 'pyspark.mllib', + 'pyspark.mllib.linalg', + 'pyspark.mllib.stat', 'pyspark.ml', + 'pyspark.ml.linalg', 'pyspark.ml.param', 'pyspark.ml.stat', - 'pyspark.ml.linalg', 'pyspark.sql', 'pyspark.streaming', 'pyspark.bin',