diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py index 251625ae412cf..e6d8e9f24a557 100644 --- a/python/pyspark/sql/pandas/conversion.py +++ b/python/pyspark/sql/pandas/conversion.py @@ -413,7 +413,7 @@ def _create_from_pandas_with_arrow(self, pdf, schema, timezone): # Slice the DataFrame to be batched step = -(-len(pdf) // self.sparkContext.defaultParallelism) # round int up - pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step)) + pdf_slices = (pdf.iloc[start:start + step] for start in xrange(0, len(pdf), step)) # Create list of Arrow (columns, type) for serializer dump_stream arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)] diff --git a/python/pyspark/sql/tests/test_arrow.py b/python/pyspark/sql/tests/test_arrow.py index c59765dd79eb9..913b43b6ddb5a 100644 --- a/python/pyspark/sql/tests/test_arrow.py +++ b/python/pyspark/sql/tests/test_arrow.py @@ -442,6 +442,12 @@ def test_createDateFrame_with_category_type(self): self.assertIsInstance(arrow_first_category_element, str) self.assertIsInstance(spark_first_category_element, str) + def test_createDataFrame_with_float_index(self): + # SPARK-32098: float index should not produce duplicated or truncated Spark DataFrame + self.assertEqual( + self.spark.createDataFrame( + pd.DataFrame({'a': [1, 2, 3]}, index=[2., 3., 4.])).distinct().count(), 3) + @unittest.skipIf( not have_pandas or not have_pyarrow,