From 612426e34e29229aa2187e8775ee0e453288c75d Mon Sep 17 00:00:00 2001
From: HyukjinKwon <gurwls223@apache.org>
Date: Thu, 25 Jun 2020 20:06:35 +0900
Subject: [PATCH] Use iloc for positional slicing instead of direct slicing in
 createDataFrame with Arrow

---
 python/pyspark/sql/pandas/conversion.py | 2 +-
 python/pyspark/sql/tests/test_arrow.py  | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py
index 251625ae412cf..e6d8e9f24a557 100644
--- a/python/pyspark/sql/pandas/conversion.py
+++ b/python/pyspark/sql/pandas/conversion.py
@@ -413,7 +413,7 @@ def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
 
         # Slice the DataFrame to be batched
         step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
-        pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))
+        pdf_slices = (pdf.iloc[start:start + step] for start in xrange(0, len(pdf), step))
 
         # Create list of Arrow (columns, type) for serializer dump_stream
         arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py
index c59765dd79eb9..913b43b6ddb5a 100644
--- a/python/pyspark/sql/tests/test_arrow.py
+++ b/python/pyspark/sql/tests/test_arrow.py
@@ -442,6 +442,12 @@ def test_createDateFrame_with_category_type(self):
         self.assertIsInstance(arrow_first_category_element, str)
         self.assertIsInstance(spark_first_category_element, str)
 
+    def test_createDataFrame_with_float_index(self):
+        # SPARK-32098: float index should not produce duplicated or truncated Spark DataFrame
+        self.assertEqual(
+            self.spark.createDataFrame(
+                pd.DataFrame({'a': [1, 2, 3]}, index=[2., 3., 4.])).distinct().count(), 3)
+
 
 @unittest.skipIf(
     not have_pandas or not have_pyarrow,