Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

tests: database_types dual-use for benchmarks #67

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions data_diff/databases/presto.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,13 @@ def to_string(self, s: str):

def _query(self, sql_code: str) -> list:
"Uses the standard SQL cursor interface"
return _query_conn(self._conn, sql_code)
c = self._conn.cursor()
c.execute(sql_code)
if sql_code.lower().startswith("select"):
return c.fetchall()
# Required for the query to actually run 🤯
if re.match(r"(insert|create|truncate|drop)", sql_code, re.IGNORECASE):
return c.fetchone()

def close(self):
    """Close the underlying database connection."""
    self._conn.close()
Expand Down Expand Up @@ -88,7 +94,7 @@ def _parse_type(
datetime_precision = int(m.group(1))
return cls(
precision=datetime_precision if datetime_precision is not None else DEFAULT_DATETIME_PRECISION,
rounds=False,
rounds=self.ROUNDS_ON_PREC_LOSS,
)

number_regexps = {r"decimal\((\d+),(\d+)\)": Decimal}
Expand Down
7 changes: 7 additions & 0 deletions data_diff/diff_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,13 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
f"size: {table2.max_key-table1.min_key}"
)

# The entire segment wasn't below the threshold, but the next set of
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This belongs in a separate PR

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. I included it here because it is what makes the 'download' benchmark a fair comparison: the work is split into multiple segments, which are then processed in threads.

# segments might be. In that case, it's useless to checksum them.
max_rows_from_keys = max(table1.max_key - table1.min_key, table2.max_key - table2.min_key)
if max_rows_from_keys < self.bisection_threshold:
yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max_rows_from_keys)
return

(count1, checksum1), (count2, checksum2) = self._threaded_call("count_and_checksum", [table1, table2])

if count1 == 0 and count2 == 0:
Expand Down
5 changes: 4 additions & 1 deletion data_diff/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,10 @@ class Checksum(Sql):

def compile(self, c: "Compiler"):
    """Compile this checksum into SQL of the form ``sum(md5_as_int(expr))``.

    A single expression is hashed as-is; two or more expressions are
    first concatenated so the checksum covers all of them.
    """
    compiled_exprs = ", ".join(map(c.compile, self.exprs))
    expr = compiled_exprs
    if len(self.exprs) > 1:
        # concat() is only needed (and only valid) for 2+ arguments.
        expr = f"concat({compiled_exprs})"
    md5 = c.database.md5_to_int(expr)
    return f"sum({md5})"

Expand Down
1 change: 1 addition & 0 deletions dev/presto-conf/standalone/catalog/postgresql.properties
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ connector.name=postgresql
connection-url=jdbc:postgresql://postgres:5432/postgres
connection-user=postgres
connection-password=Password1
allow-drop-table=true
16 changes: 13 additions & 3 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,23 @@ services:
postgres:
container_name: postgresql
image: postgres:14.1-alpine
shm_size: 1g
# work_mem: less tmp files
# maintenance_work_mem: improve table-level op perf
# max_wal_size: allow more time before merging to heap
command: >
-c work_mem=1GB
-c maintenance_work_mem=1GB
-c max_wal_size=8GB
-c shared_buffers=16GB
-c effective_cache_size=48GB
-c maintenance_work_mem=2GB
-c checkpoint_completion_target=0.9
-c default_statistics_target=100
-c random_page_cost=1.1
-c effective_io_concurrency=200
-c work_mem=20971kB
-c max_worker_processes=14
-c max_parallel_workers_per_gather=4
-c max_parallel_workers=14
-c max_parallel_maintenance_workers=4
restart: always
volumes:
- postgresql-data:/var/lib/postgresql/data:delegated
Expand Down
15 changes: 14 additions & 1 deletion tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,21 @@

from data_diff import databases as db
import logging
import os

# Test/benchmark tuning, driven by environment variables.
DEFAULT_N_SAMPLES = 50
# Number of rows to generate per test table (N_SAMPLES env var overrides).
N_SAMPLES = int(os.environ.get('N_SAMPLES', DEFAULT_N_SAMPLES))
# NOTE(review): any non-empty string ("0", "false", ...) counts as truthy
# here; unset the variable (or set it empty) to disable.
BENCHMARK = os.environ.get('BENCHMARK', False)

level = logging.WARN
if os.environ.get('DEBUG', False):
    level = logging.DEBUG

# basicConfig only takes effect on its first call, so it must not be
# preceded by another basicConfig call at import time.
logging.basicConfig(level=level)
logging.getLogger("diff_tables").setLevel(level)
logging.getLogger("database").setLevel(level)
if BENCHMARK:
    logging.getLogger("benchmark").setLevel(logging.DEBUG)

TEST_MYSQL_CONN_STRING: str = "mysql://mysql:Password1@localhost/mysql"
TEST_POSTGRESQL_CONN_STRING: str = None
Expand Down
Loading