3 files changed, +31 −3 lines

First, `PandasConversionMixin.toPandas` gains a `selfDestruct` parameter and threads it through to PyArrow's `Table.to_pandas`:

```diff
@@ -34,7 +34,7 @@ class PandasConversionMixin(object):
     """
 
     @since(1.3)
-    def toPandas(self):
+    def toPandas(self, selfDestruct=False):
         """
         Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``.
 
@@ -103,10 +103,22 @@ def toPandas(self):
            batches = self.toDF(*tmp_column_names)._collect_as_arrow()
            if len(batches) > 0:
                table = pyarrow.Table.from_batches(batches)
+               del batches
                # Pandas DataFrame created from PyArrow uses datetime64[ns] for date type
                # values, but we should use datetime.date to match the behavior with when
                # Arrow optimization is disabled.
-               pdf = table.to_pandas(date_as_object=True)
+               pandas_options = {'date_as_object': True}
+               if selfDestruct:
+                   # Configure PyArrow to use as little memory as possible:
+                   # self_destruct - free columns as they are converted
+                   # split_blocks - create a separate Pandas block for each column
+                   # use_threads - convert one column at a time
+                   pandas_options.update({
+                       'self_destruct': True,
+                       'split_blocks': True,
+                       'use_threads': False,
+                   })
+               pdf = table.to_pandas(**pandas_options)
                # Rename back to the original column names.
                pdf.columns = self.columns
                for field in self.schema:
```
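For reference, here is a standalone sketch (not part of this diff) of what the `pandas_options` branch does, using only stock PyArrow; `self_destruct`, `split_blocks`, and `use_threads` are existing keyword arguments of `pyarrow.Table.to_pandas`. The one sharp edge: a table converted with `self_destruct=True` must not be touched afterwards, because its column buffers are released during the conversion.

```python
import pyarrow as pa

# Toy table standing in for the batches collected from the JVM.
table = pa.table({'a': list(range(1000)), 'b': [float(i) for i in range(1000)]})

pandas_options = {'date_as_object': True}
pandas_options.update({
    'self_destruct': True,   # release each Arrow column once it has been converted
    'split_blocks': True,    # one pandas block per column, enabling zero-copy moves
    'use_threads': False,    # convert one column at a time, keeping peak usage low
})

pdf = table.to_pandas(**pandas_options)
# `table` must not be used past this point; its buffers were freed above.
print(pdf.shape)
```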
Next, `ArrowStreamSerializer.load_stream` re-allocates each column of every incoming record batch, so that `selfDestruct` can actually release memory column by column:

```diff
@@ -90,7 +90,17 @@ def load_stream(self, stream):
         import pyarrow as pa
         reader = pa.ipc.open_stream(stream)
         for batch in reader:
-            yield batch
+            # In the case toPandas is called with selfDestruct=True,
+            # ensure each column in each record batch is contained in
+            # its own allocation. Otherwise, selfDestruct does
+            # nothing; it frees each column as it's converted, but each
+            # column will actually be a list of slices of record
+            # batches, and so no memory is actually freed until all
+            # columns are converted.
+            split_batch = pa.RecordBatch.from_arrays([
+                pa.concat_arrays([array]) for array in batch
+            ], schema=batch.schema)
+            yield split_batch
 
     def __repr__(self):
         return "ArrowStreamSerializer"
```
Finally, a round-trip test covering the new flag:

```diff
@@ -190,6 +190,12 @@ def test_pandas_round_trip(self):
         pdf_arrow = df.toPandas()
         assert_frame_equal(pdf_arrow, pdf)
 
+    def test_pandas_self_destruct(self):
+        pdf = self.create_pandas_data_frame()
+        df = self.spark.createDataFrame(self.data, schema=self.schema)
+        pdf_arrow = df.toPandas(selfDestruct=True)
+        assert_frame_equal(pdf_arrow, pdf)
+
     def test_filtered_frame(self):
         df = self.spark.range(3).toDF("i")
         pdf = df.filter("i < 0").toPandas()
```
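Hypothetical end-to-end use of the new flag (assumes a live `SparkSession` bound to `spark`; the Arrow conf is an existing Spark setting and is required because `selfDestruct` only affects the Arrow conversion path):

```python
# selfDestruct only matters on the Arrow path, so enable Arrow conversion first.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

df = spark.range(10_000_000).toDF("i")
# Peak memory should be roughly one copy of the data (Arrow or pandas),
# not both at once, while the columns are converted.
pdf = df.toPandas(selfDestruct=True)
print(pdf.shape)
```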