Commit a6a189f

[SPARK-32953][PYTHON] Add Arrow self_destruct support to toPandas

1 parent: f3ad32f
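For orientation, here is a minimal usage sketch of the new flag. The session setup and toy DataFrame are illustrative, not part of the commit; Arrow-based conversion must be enabled for this code path to be taken.

```python
from pyspark.sql import SparkSession

# Illustrative setup; only toPandas(selfDestruct=...) comes from this commit.
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

df = spark.range(1_000_000).toDF("id")

# With selfDestruct=True, PyArrow frees each Arrow column as soon as it has
# been converted, reducing peak memory during the Arrow-to-pandas step.
pdf = df.toPandas(selfDestruct=True)
```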

File tree: 3 files changed, +31 −3 lines

- python/pyspark/sql/pandas/conversion.py
- python/pyspark/sql/pandas/serializers.py
- python/pyspark/sql/tests/test_arrow.py

python/pyspark/sql/pandas/conversion.py

Lines changed: 14 additions & 2 deletions
```diff
@@ -34,7 +34,7 @@ class PandasConversionMixin(object):
     """
 
     @since(1.3)
-    def toPandas(self):
+    def toPandas(self, selfDestruct=False):
         """
         Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``.
 
@@ -103,10 +103,22 @@ def toPandas(self):
                     batches = self.toDF(*tmp_column_names)._collect_as_arrow()
                     if len(batches) > 0:
                         table = pyarrow.Table.from_batches(batches)
+                        del batches
                         # Pandas DataFrame created from PyArrow uses datetime64[ns] for date type
                         # values, but we should use datetime.date to match the behavior with when
                         # Arrow optimization is disabled.
-                        pdf = table.to_pandas(date_as_object=True)
+                        pandas_options = {'date_as_object': True}
+                        if selfDestruct:
+                            # Configure PyArrow to use as little memory as possible:
+                            # self_destruct - free columns as they are converted
+                            # split_blocks - create a separate Pandas block for each column
+                            # use_threads - convert one column at a time
+                            pandas_options.update({
+                                'self_destruct': True,
+                                'split_blocks': True,
+                                'use_threads': False,
+                            })
+                        pdf = table.to_pandas(**pandas_options)
                         # Rename back to the original column names.
                         pdf.columns = self.columns
                         for field in self.schema:
```
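The options passed here are plain `pyarrow.Table.to_pandas` arguments, so the behaviour can be sketched with PyArrow alone (the toy table below is illustrative):

```python
import pyarrow as pa

table = pa.table({"x": list(range(10)), "y": [float(i) for i in range(10)]})

# self_destruct frees each Arrow column once it has been converted,
# split_blocks gives every column its own pandas block so it can be released
# independently, and use_threads=False converts one column at a time.
pdf = table.to_pandas(
    date_as_object=True,
    self_destruct=True,
    split_blocks=True,
    use_threads=False,
)
# After a self-destruct conversion the table's columns have been released,
# so the Table object must not be used again.
del table
```

This also explains the `del batches` added above: once the Table references the record batches' buffers, dropping the batch list avoids holding an extra Python reference that would keep the Arrow memory alive while columns are being freed.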

python/pyspark/sql/pandas/serializers.py

Lines changed: 11 additions & 1 deletion
```diff
@@ -90,7 +90,17 @@ def load_stream(self, stream):
         import pyarrow as pa
         reader = pa.ipc.open_stream(stream)
         for batch in reader:
-            yield batch
+            # In the case toPandas is called with selfDestruct=True,
+            # ensure each column in each record batch is contained in
+            # its own allocation. Otherwise, selfDestruct does
+            # nothing; it frees each column as it is converted, but each
+            # column will actually be a list of slices of record
+            # batches, and so no memory is actually freed until all
+            # columns are converted.
+            split_batch = pa.RecordBatch.from_arrays([
+                pa.concat_arrays([array]) for array in batch
+            ], schema=batch.schema)
+            yield split_batch
 
     def __repr__(self):
         return "ArrowStreamSerializer"
```

python/pyspark/sql/tests/test_arrow.py

Lines changed: 6 additions & 0 deletions
```diff
@@ -190,6 +190,12 @@ def test_pandas_round_trip(self):
         pdf_arrow = df.toPandas()
         assert_frame_equal(pdf_arrow, pdf)
 
+    def test_pandas_self_destruct(self):
+        pdf = self.create_pandas_data_frame()
+        df = self.spark.createDataFrame(self.data, schema=self.schema)
+        pdf_arrow = df.toPandas(selfDestruct=True)
+        assert_frame_equal(pdf_arrow, pdf)
+
     def test_filtered_frame(self):
         df = self.spark.range(3).toDF("i")
         pdf = df.filter("i < 0").toPandas()
```
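The new test relies on helpers from the test class in test_arrow.py (`create_pandas_data_frame`, `self.data`, `self.schema`). A rough standalone equivalent of the round-trip check, with illustrative data and setup, would be:

```python
import pandas as pd
from pandas.testing import assert_frame_equal
from pyspark.sql import SparkSession

# Illustrative setup; the real test gets its data and schema from the
# test class in test_arrow.py.
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
df = spark.createDataFrame(pdf)

# The self-destruct path must produce the same pandas DataFrame as the
# default Arrow-based conversion.
assert_frame_equal(df.toPandas(selfDestruct=True), df.toPandas())
```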
