From 612426e34e29229aa2187e8775ee0e453288c75d Mon Sep 17 00:00:00 2001
From: HyukjinKwon
Date: Thu, 25 Jun 2020 20:06:35 +0900
Subject: [PATCH] Use iloc for positional slicing instead of direct slicing in createDataFrame with Arrow

---
Review notes (text between the "---" line and the diff is not part of the
commit message and is not applied to any file):

* conversion.py, _create_from_pandas_with_arrow: the batching generator
  now builds each slice with pdf.iloc[start:start + step] instead of
  pdf[start:start + step]. `start` comes from xrange(0, len(pdf), step),
  i.e. it is a row offset, so the slice has to be positional. Per the
  pandas indexing documentation, plain [] slicing on a float index selects
  by index value rather than by position, whereas .iloc is always
  positional.
* test_arrow.py: adds test_createDataFrame_with_float_index (SPARK-32098).
  It creates a 3-row pandas DataFrame whose index is [2., 3., 4.] and
  asserts that the resulting Spark DataFrame has exactly 3 distinct rows,
  guarding against duplicated or truncated output.
* NOTE(review): whether self.spark in this test class runs with Arrow
  enabled is not visible in this patch - confirm against the test setup.

 python/pyspark/sql/pandas/conversion.py | 2 +-
 python/pyspark/sql/tests/test_arrow.py  | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py
index 251625ae412cf..e6d8e9f24a557 100644
--- a/python/pyspark/sql/pandas/conversion.py
+++ b/python/pyspark/sql/pandas/conversion.py
@@ -413,7 +413,7 @@ def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
 
         # Slice the DataFrame to be batched
         step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
-        pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))
+        pdf_slices = (pdf.iloc[start:start + step] for start in xrange(0, len(pdf), step))
 
         # Create list of Arrow (columns, type) for serializer dump_stream
         arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py
index c59765dd79eb9..913b43b6ddb5a 100644
--- a/python/pyspark/sql/tests/test_arrow.py
+++ b/python/pyspark/sql/tests/test_arrow.py
@@ -442,6 +442,12 @@ def test_createDateFrame_with_category_type(self):
         self.assertIsInstance(arrow_first_category_element, str)
         self.assertIsInstance(spark_first_category_element, str)
 
+    def test_createDataFrame_with_float_index(self):
+        # SPARK-32098: float index should not produce duplicated or truncated Spark DataFrame
+        self.assertEqual(
+            self.spark.createDataFrame(
+                pd.DataFrame({'a': [1, 2, 3]}, index=[2., 3., 4.])).distinct().count(), 3)
+
 
 @unittest.skipIf(
     not have_pandas or not have_pyarrow,