
Commit 3463b12

Refactor decimal conversion in PyArrow tables to use direct casting (#544)
This PR replaces the previous implementation of `convert_decimals_in_arrow_table()` with a more efficient approach that uses PyArrow's native casting operation instead of going through pandas conversion and array creation.

- Remove the conversion to a pandas DataFrame via the `to_pandas()` and `apply()` methods
- Remove the intermediate steps of building an array from the decimal column and setting it back on the table
- Replace them with direct type casting using PyArrow's `cast()` method
- Build a new table from the transformed columns rather than modifying the original table in place
- Create a new schema based on the modified fields

The new approach is more performant because it avoids the pandas conversion overhead. The table below highlights substantial performance improvements when retrieving all rows from a table containing decimal columns, particularly when compression is disabled. Even greater gains were observed with compression enabled: approximately an 84% improvement (6 seconds compared to 39 seconds). Benchmarking was performed against e2-dogfood, with the client located in the us-west-2 region.

![image](https://github.com/user-attachments/assets/5407b651-8ab6-4c13-b525-cf912f503ba0)

Signed-off-by: Jayant Singh <[email protected]>
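To make the comparison concrete, here is a minimal sketch of the two conversion paths on a single column. The column contents, and the assumption that decimal values arrive as strings, are invented for this illustration; `to_pandas()`, `apply()`, and `cast()` are the real APIs the old and new code rely on:

```python
from decimal import Decimal

import pyarrow

# Illustrative column: decimal values represented as strings
# (an assumption for this sketch), with a null mixed in.
col = pyarrow.chunked_array([["1.10", "2.25", None]])
dtype = pyarrow.decimal128(10, 2)

# Old path: round-trip through pandas, then rebuild an Arrow array.
decimal_col = col.to_pandas().apply(lambda v: v if v is None else Decimal(v))
old_result = pyarrow.array(decimal_col, type=dtype)

# New path: a single native cast, no pandas involved.
new_result = col.cast(dtype)

# Both paths produce the same values.
assert old_result.to_pylist() == new_result.to_pylist()
```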
1 parent bdb5154 commit 3463b12

File tree

1 file changed: +19 −9 lines

src/databricks/sql/utils.py (+19 −9)
```diff
@@ -611,21 +611,31 @@ def convert_arrow_based_set_to_arrow_table(arrow_batches, lz4_compressed, schema
 
 
 def convert_decimals_in_arrow_table(table, description) -> "pyarrow.Table":
+    new_columns = []
+    new_fields = []
+
     for i, col in enumerate(table.itercolumns()):
+        field = table.field(i)
+
         if description[i][1] == "decimal":
-            decimal_col = col.to_pandas().apply(
-                lambda v: v if v is None else Decimal(v)
-            )
             precision, scale = description[i][4], description[i][5]
             assert scale is not None
             assert precision is not None
-            # Spark limits decimal to a maximum scale of 38,
-            # so 128 is guaranteed to be big enough
+            # create the target decimal type
             dtype = pyarrow.decimal128(precision, scale)
-            col_data = pyarrow.array(decimal_col, type=dtype)
-            field = table.field(i).with_type(dtype)
-            table = table.set_column(i, field, col_data)
-    return table
+
+            new_col = col.cast(dtype)
+            new_field = field.with_type(dtype)
+
+            new_columns.append(new_col)
+            new_fields.append(new_field)
+        else:
+            new_columns.append(col)
+            new_fields.append(field)
+
+    new_schema = pyarrow.schema(new_fields)
+
+    return pyarrow.Table.from_arrays(new_columns, schema=new_schema)
 
 
 def convert_to_assigned_datatypes_in_column_table(column_table, description):
```
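For readers who want to try the new code path, here is a self-contained sketch that reproduces the refactored function and exercises it on a toy table. The sample data and `description` tuples are invented for illustration; their index layout matches what the function actually reads (type name at index 1, precision at index 4, scale at index 5), following the DB-API cursor description shape:

```python
import pyarrow

# Copy of the refactored function from this commit, reproduced here so the
# sketch is self-contained.
def convert_decimals_in_arrow_table(table, description) -> "pyarrow.Table":
    new_columns = []
    new_fields = []

    for i, col in enumerate(table.itercolumns()):
        field = table.field(i)

        if description[i][1] == "decimal":
            precision, scale = description[i][4], description[i][5]
            assert scale is not None
            assert precision is not None
            # create the target decimal type
            dtype = pyarrow.decimal128(precision, scale)

            new_columns.append(col.cast(dtype))
            new_fields.append(field.with_type(dtype))
        else:
            new_columns.append(col)
            new_fields.append(field)

    new_schema = pyarrow.schema(new_fields)
    return pyarrow.Table.from_arrays(new_columns, schema=new_schema)

# Toy inputs (made up for the example): one decimal column sent as strings,
# one plain integer column. Each description row follows the DB-API layout:
# (name, type_code, display_size, internal_size, precision, scale, null_ok).
table = pyarrow.table({"amount": ["19.99", "0.50", None], "id": [1, 2, 3]})
description = [
    ("amount", "decimal", None, None, 10, 2, True),
    ("id", "int", None, None, None, None, True),
]

converted = convert_decimals_in_arrow_table(table, description)
print(converted.schema)
# amount: decimal128(10, 2)
# id: int64
```

Note the design choice the diff makes: instead of mutating the input table column by column with `set_column()`, the new code accumulates transformed columns and fields, then builds a fresh table in one call, which keeps the loop free of repeated table reconstruction.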
