Commit 7641fd0

Address comments
1 parent 7f87d25 commit 7641fd0

4 files changed: +33 -31 lines changed

python/pyspark/sql/dataframe.py

Lines changed: 3 additions & 5 deletions

@@ -1995,13 +1995,12 @@ def toPandas(self):
                 to_arrow_schema(self.schema)
             except Exception as e:
 
-                if self.sql_ctx.getConf("spark.sql.execution.arrow.fallback.enabled", "false") \
+                if self.sql_ctx.getConf("spark.sql.execution.arrow.fallback.enabled", "true") \
                         .lower() == "true":
                     msg = (
                         "toPandas attempted Arrow optimization because "
                         "'spark.sql.execution.arrow.enabled' is set to true; however, "
-                        "failed by the reason below:\n"
-                        " %s\n"
+                        "failed by the reason below:\n %s\n"
                         "Attempts non-optimization as "
                         "'spark.sql.execution.arrow.fallback.enabled' is set to "
                         "true." % _exception_message(e))
@@ -2011,8 +2010,7 @@ def toPandas(self):
                     msg = (
                         "toPandas attempted Arrow optimization because "
                         "'spark.sql.execution.arrow.enabled' is set to true; however, "
-                        "failed by the reason below:\n"
-                        " %s\n"
+                        "failed by the reason below:\n %s\n"
                         "For fallback to non-optimization automatically, please set true to "
                         "'spark.sql.execution.arrow.fallback.enabled'." % _exception_message(e))
                     raise RuntimeError(msg)
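For readers skimming the diff, the surrounding control flow in toPandas() is roughly the following. This is a minimal sketch, not pyspark source: convert_with_arrow, convert_without_arrow, and the dict-style conf are hypothetical stand-ins.

    import warnings

    def convert_with_arrow(df):
        # Hypothetical stand-in for the Arrow-optimized conversion path.
        raise NotImplementedError("pretend Arrow cannot handle this schema")

    def convert_without_arrow(df):
        # Hypothetical stand-in for the plain, non-optimized conversion path.
        return df

    def to_pandas(df, conf):
        # Same shape as toPandas() above: try Arrow first; on failure either
        # warn and fall back, or raise, depending on the fallback flag.
        if conf.get("spark.sql.execution.arrow.enabled", "false").lower() == "true":
            try:
                return convert_with_arrow(df)
            except Exception as e:
                # After this commit the fallback flag defaults to "true".
                if conf.get("spark.sql.execution.arrow.fallback.enabled",
                            "true").lower() == "true":
                    warnings.warn("Arrow optimization failed (%s); falling back "
                                  "to the non-optimized path." % e)
                else:
                    raise RuntimeError("Arrow optimization failed: %s" % e)
        return convert_without_arrow(df)

The only behavioral change in this file is the default read for the flag: an unset flag now means "warn and fall back" rather than "raise".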

python/pyspark/sql/session.py

Lines changed: 3 additions & 5 deletions

@@ -668,13 +668,12 @@ def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=Tr
             except Exception as e:
                 from pyspark.util import _exception_message
 
-                if self.conf.get("spark.sql.execution.arrow.fallback.enabled", "false") \
+                if self.conf.get("spark.sql.execution.arrow.fallback.enabled", "true") \
                         .lower() == "true":
                     msg = (
                         "createDataFrame attempted Arrow optimization because "
                         "'spark.sql.execution.arrow.enabled' is set to true; however, "
-                        "failed by the reason below:\n"
-                        " %s\n"
+                        "failed by the reason below:\n %s\n"
                         "Attempts non-optimization as "
                         "'spark.sql.execution.arrow.fallback.enabled' is set to "
                         "true." % _exception_message(e))
@@ -683,8 +682,7 @@ def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=Tr
                     msg = (
                         "createDataFrame attempted Arrow optimization because "
                         "'spark.sql.execution.arrow.enabled' is set to true; however, "
-                        "failed by the reason below:\n"
-                        " %s\n"
+                        "failed by the reason below:\n %s\n"
                         "For fallback to non-optimization automatically, please set true to "
                         "'spark.sql.execution.arrow.fallback.enabled'." % _exception_message(e))
                     raise RuntimeError(msg)
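The same default change applies to createDataFrame. A usage sketch of both modes, assuming a running SparkSession named spark and a pyarrow build without MapType support (the same condition the tests in python/pyspark/sql/tests.py below rely on):

    import pandas as pd

    spark.conf.set("spark.sql.execution.arrow.enabled", "true")

    # New default: warn and fall back to the non-Arrow path on failure.
    spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")
    df = spark.createDataFrame(pd.DataFrame([[{u'a': 1}]]), "a: map<string, int>")

    # Strict mode: surface the Arrow failure instead of falling back.
    spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "false")
    try:
        spark.createDataFrame(pd.DataFrame([[{u'a': 1}]]), "a: map<string, int>")
    except RuntimeError as e:
        print("Arrow optimization failed:", e)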

python/pyspark/sql/tests.py

Lines changed: 25 additions & 18 deletions

@@ -197,6 +197,23 @@ def tearDownClass(cls):
         ReusedPySparkTestCase.tearDownClass()
         cls.spark.stop()
 
+    @contextmanager
+    def sql_conf(self, key, value):
+        """
+        A convenient context manager to test some configuration-specific logic.
+        This sets the configuration `key` to `value` and then restores it on exit.
+        """
+
+        orig_value = self.spark.conf.get(key, None)
+        self.spark.conf.set(key, value)
+        try:
+            yield
+        finally:
+            if orig_value is None:
+                self.spark.conf.unset(key)
+            else:
+                self.spark.conf.set(key, orig_value)
+
     def assertPandasEqual(self, expected, result):
         msg = ("DataFrames are not equal: " +
                "\n\nExpected:\n%s\n%s" % (expected, expected.dtypes) +
@@ -3460,6 +3477,8 @@ def setUpClass(cls):
 
         cls.spark.conf.set("spark.sql.session.timeZone", tz)
         cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true")
+        # Disable fallback by default to easily detect the failures.
+        cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "false")
         cls.schema = StructType([
             StructField("1_str_t", StringType(), True),
             StructField("2_int_t", IntegerType(), True),
@@ -3495,22 +3514,10 @@ def create_pandas_data_frame(self):
         data_dict["4_float_t"] = np.float32(data_dict["4_float_t"])
         return pd.DataFrame(data=data_dict)
 
-    @contextmanager
-    def arrow_fallback(self, enabled):
-        orig_value = self.spark.conf.get("spark.sql.execution.arrow.fallback.enabled", None)
-        self.spark.conf.set("spark.sql.execution.arrow.fallback.enabled", enabled)
-        try:
-            yield
-        finally:
-            if orig_value is None:
-                self.spark.conf.unset("spark.sql.execution.arrow.fallback.enabled")
-            else:
-                self.spark.conf.set("spark.sql.execution.arrow.fallback.enabled", orig_value)
-
     def test_toPandas_fallback_enabled(self):
         import pandas as pd
 
-        with self.arrow_fallback(True):
+        with self.sql_conf("spark.sql.execution.arrow.fallback.enabled", True):
             schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)])
             df = self.spark.createDataFrame([({u'a': 1},)], schema=schema)
             with QuietTest(self.sc):
@@ -3525,7 +3532,7 @@ def test_toPandas_fallback_enabled(self):
             self.assertPandasEqual(pdf, pd.DataFrame({u'map': [{u'a': 1}]}))
 
     def test_toPandas_fallback_disabled(self):
-        with self.arrow_fallback(False):
+        with self.sql_conf("spark.sql.execution.arrow.fallback.enabled", False):
             schema = StructType([StructField("map", MapType(StringType(), IntegerType()), True)])
             df = self.spark.createDataFrame([(None,)], schema=schema)
             with QuietTest(self.sc):
@@ -3650,7 +3657,7 @@ def test_createDataFrame_with_incorrect_schema(self):
         pdf = self.create_pandas_data_frame()
         wrong_schema = StructType(list(reversed(self.schema)))
         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(TypeError, ".*field.*can.not.accept.*type"):
+            with self.assertRaisesRegexp(RuntimeError, ".*No cast.*string.*timestamp.*"):
                 self.spark.createDataFrame(pdf, schema=wrong_schema)
 
     def test_createDataFrame_with_names(self):
@@ -3675,7 +3682,7 @@ def test_createDataFrame_column_name_encoding(self):
     def test_createDataFrame_with_single_data_type(self):
         import pandas as pd
         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(TypeError, ".*IntegerType.*tuple"):
+            with self.assertRaisesRegexp(RuntimeError, ".*IntegerType.*not supported.*"):
                 self.spark.createDataFrame(pd.DataFrame({"a": [1]}), schema="int")
 
     def test_createDataFrame_does_not_modify_input(self):
@@ -3734,7 +3741,7 @@ def test_createDataFrame_fallback_enabled(self):
         import pandas as pd
 
         with QuietTest(self.sc):
-            with self.arrow_fallback(True):
+            with self.sql_conf("spark.sql.execution.arrow.fallback.enabled", True):
                 with warnings.catch_warnings(record=True) as warns:
                     df = self.spark.createDataFrame(
                         pd.DataFrame([[{u'a': 1}]]), "a: map<string, int>")
@@ -3750,7 +3757,7 @@ def test_createDataFrame_fallback_disabled(self):
         import pandas as pd
 
         with QuietTest(self.sc):
-            with self.arrow_fallback(False):
+            with self.sql_conf("spark.sql.execution.arrow.fallback.enabled", False):
                 with self.assertRaisesRegexp(Exception, 'Unsupported type'):
                     self.spark.createDataFrame(
                         pd.DataFrame([[{u'a': 1}]]), "a: map<string, int>")
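The sql_conf helper generalizes the removed arrow_fallback manager to any SQL configuration key. The same restore-on-exit pattern also works outside the test suite; a minimal standalone sketch, assuming a running SparkSession named spark:

    from contextlib import contextmanager

    @contextmanager
    def sql_conf(spark, key, value):
        # Set `key` to `value` for the duration of the block, then restore the
        # original value, or unset the key if it had no value before.
        orig_value = spark.conf.get(key, None)
        spark.conf.set(key, value)
        try:
            yield
        finally:
            if orig_value is None:
                spark.conf.unset(key)
            else:
                spark.conf.set(key, orig_value)

    # Example: run one conversion with fallback disabled, then restore it.
    # with sql_conf(spark, "spark.sql.execution.arrow.fallback.enabled", "false"):
    #     df.toPandas()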

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 2 additions & 3 deletions

@@ -1070,9 +1070,8 @@ object SQLConf {
 
   val ARROW_FALLBACK_ENABLE =
     buildConf("spark.sql.execution.arrow.fallback.enabled")
-      .doc("When true, the optimization by 'spark.sql.execution.arrow.enabled' " +
-        "could be disabled when it is unable to be used, and fallback to " +
-        "non-optimization.")
+      .doc("When true, optimizations enabled by 'spark.sql.execution.arrow.enabled' will " +
+        "fallback automatically to non-optimized implementations if an error occurs.")
       .booleanConf
       .createWithDefault(true)
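The Scala-side default was already true; this hunk only rewrites the doc string, and the Python callers above now read the same "true" default instead of a hardcoded "false". A quick way to confirm the effective default from PySpark (assuming a running SparkSession named spark, with the flag not set explicitly):

    spark.conf.unset("spark.sql.execution.arrow.fallback.enabled")
    print(spark.conf.get("spark.sql.execution.arrow.fallback.enabled"))  # 'true'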
