Skip to content

Commit 4c673c6

Browse files
techaddictjkbradley
authored andcommitted
[SPARK-18274][ML][PYSPARK] Memory leak in PySpark JavaWrapper
## What changes were proposed in this pull request? In`JavaWrapper `'s destructor make Java Gateway dereference object in destructor, using `SparkContext._active_spark_context._gateway.detach` Fixing the copying parameter bug, by moving the `copy` method from `JavaModel` to `JavaParams` ## How was this patch tested? ```scala import random, string from pyspark.ml.feature import StringIndexer l = [(''.join(random.choice(string.ascii_uppercase) for _ in range(10)), ) for _ in range(int(7e5))] # 700000 random strings of 10 characters df = spark.createDataFrame(l, ['string']) for i in range(50): indexer = StringIndexer(inputCol='string', outputCol='index') indexer.fit(df) ``` * Before: would keep StringIndexer strong reference, causing GC issues and is halted midway After: garbage collection works as the object is dereferenced, and computation completes * Mem footprint tested using profiler * Added a parameter copy related test which was failing before. Author: Sandeep Singh <[email protected]> Author: jkbradley <[email protected]> Closes #15843 from techaddict/SPARK-18274. (cherry picked from commit 78bb7f8) Signed-off-by: Joseph K. Bradley <[email protected]>
1 parent 6916ddc commit 4c673c6

File tree

2 files changed

+41
-18
lines changed

2 files changed

+41
-18
lines changed

python/pyspark/ml/tests.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,24 @@ def test_word2vec_param(self):
390390
self.assertEqual(model.getWindowSize(), 6)
391391

392392

393+
class EvaluatorTests(SparkSessionTestCase):
394+
395+
def test_java_params(self):
396+
"""
397+
This tests a bug fixed by SPARK-18274 which causes multiple copies
398+
of a Params instance in Python to be linked to the same Java instance.
399+
"""
400+
evaluator = RegressionEvaluator(metricName="r2")
401+
df = self.spark.createDataFrame([Row(label=1.0, prediction=1.1)])
402+
evaluator.evaluate(df)
403+
self.assertEqual(evaluator._java_obj.getMetricName(), "r2")
404+
evaluatorCopy = evaluator.copy({evaluator.metricName: "mae"})
405+
evaluator.evaluate(df)
406+
evaluatorCopy.evaluate(df)
407+
self.assertEqual(evaluator._java_obj.getMetricName(), "r2")
408+
self.assertEqual(evaluatorCopy._java_obj.getMetricName(), "mae")
409+
410+
393411
class FeatureTests(SparkSessionTestCase):
394412

395413
def test_binarizer(self):

python/pyspark/ml/wrapper.py

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,10 @@ class JavaParams(JavaWrapper, Params):
7171

7272
__metaclass__ = ABCMeta
7373

74+
def __del__(self):
75+
if SparkContext._active_spark_context:
76+
SparkContext._active_spark_context._gateway.detach(self._java_obj)
77+
7478
def _make_java_param_pair(self, param, value):
7579
"""
7680
Makes a Java parm pair.
@@ -180,6 +184,25 @@ def __get_class(clazz):
180184
% stage_name)
181185
return py_stage
182186

187+
def copy(self, extra=None):
188+
"""
189+
Creates a copy of this instance with the same uid and some
190+
extra params. This implementation first calls Params.copy and
191+
then make a copy of the companion Java pipeline component with
192+
extra params. So both the Python wrapper and the Java pipeline
193+
component get copied.
194+
195+
:param extra: Extra parameters to copy to the new instance
196+
:return: Copy of this instance
197+
"""
198+
if extra is None:
199+
extra = dict()
200+
that = super(JavaParams, self).copy(extra)
201+
if self._java_obj is not None:
202+
that._java_obj = self._java_obj.copy(self._empty_java_param_map())
203+
that._transfer_params_to_java()
204+
return that
205+
183206

184207
@inherit_doc
185208
class JavaEstimator(JavaParams, Estimator):
@@ -256,21 +279,3 @@ def __init__(self, java_model=None):
256279
super(JavaModel, self).__init__(java_model)
257280
if java_model is not None:
258281
self._resetUid(java_model.uid())
259-
260-
def copy(self, extra=None):
261-
"""
262-
Creates a copy of this instance with the same uid and some
263-
extra params. This implementation first calls Params.copy and
264-
then make a copy of the companion Java model with extra params.
265-
So both the Python wrapper and the Java model get copied.
266-
267-
:param extra: Extra parameters to copy to the new instance
268-
:return: Copy of this instance
269-
"""
270-
if extra is None:
271-
extra = dict()
272-
that = super(JavaModel, self).copy(extra)
273-
if self._java_obj is not None:
274-
that._java_obj = self._java_obj.copy(self._empty_java_param_map())
275-
that._transfer_params_to_java()
276-
return that

0 commit comments

Comments
 (0)