Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

tests: database_types dual-use for benchmarks #67

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions data_diff/databases/presto.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,13 @@ def to_string(self, s: str):

def _query(self, sql_code: str) -> list:
"Uses the standard SQL cursor interface"
return _query_conn(self._conn, sql_code)
c = self._conn.cursor()
c.execute(sql_code)
if sql_code.lower().startswith("select"):
return c.fetchall()
# Required for the query to actually run 🤯
if re.match(r"(insert|create|truncate|drop)", sql_code, re.IGNORECASE):
return c.fetchone()

def close(self):
    """Close the underlying database connection."""
    self._conn.close()
Expand Down Expand Up @@ -88,7 +94,7 @@ def _parse_type(
datetime_precision = int(m.group(1))
return cls(
precision=datetime_precision if datetime_precision is not None else DEFAULT_DATETIME_PRECISION,
rounds=False,
rounds=self.ROUNDS_ON_PREC_LOSS,
)

number_regexps = {r"decimal\((\d+),(\d+)\)": Decimal}
Expand Down
7 changes: 7 additions & 0 deletions data_diff/diff_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,13 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
f"size: {table2.max_key-table1.min_key}"
)

# The entire segment wasn't below the threshold, but the next set of
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This belongs in a separate PR

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. I included it here because it is what makes the 'download' benchmark a fair comparison: the work is split into multiple segments, which are then processed in threads.

# segments might be. In that case, it's useless to checksum them.
max_rows_from_keys = max(table1.max_key - table1.min_key, table2.max_key - table2.min_key)
if max_rows_from_keys < self.bisection_threshold:
yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max_rows_from_keys)
return

(count1, checksum1), (count2, checksum2) = self._threaded_call("count_and_checksum", [table1, table2])

if count1 == 0 and count2 == 0:
Expand Down
5 changes: 4 additions & 1 deletion data_diff/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,10 @@ class Checksum(Sql):

def compile(self, c: "Compiler"):
    """Compile this checksum into SQL of the form ``sum(md5_as_int(expr))``.

    A single expression is hashed as-is; two or more expressions are
    first concatenated so the checksum covers all of them.
    """
    compiled_exprs = ", ".join(map(c.compile, self.exprs))
    expr = compiled_exprs
    if len(self.exprs) > 1:
        # concat() is only needed (and only valid) for 2+ arguments.
        expr = f"concat({compiled_exprs})"
    md5 = c.database.md5_to_int(expr)
    return f"sum({md5})"

Expand Down
1 change: 1 addition & 0 deletions dev/presto-conf/standalone/catalog/postgresql.properties
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ connector.name=postgresql
connection-url=jdbc:postgresql://postgres:5432/postgres
connection-user=postgres
connection-password=Password1
allow-drop-table=true
16 changes: 13 additions & 3 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,23 @@ services:
postgres:
container_name: postgresql
image: postgres:14.1-alpine
shm_size: 1g
# work_mem: less tmp files
# maintenance_work_mem: improve table-level op perf
# max_wal_size: allow more time before merging to heap
command: >
-c work_mem=1GB
-c maintenance_work_mem=1GB
-c max_wal_size=8GB
-c shared_buffers=16GB
-c effective_cache_size=48GB
-c maintenance_work_mem=2GB
-c checkpoint_completion_target=0.9
-c default_statistics_target=100
-c random_page_cost=1.1
-c effective_io_concurrency=200
-c work_mem=20971kB
-c max_worker_processes=14
-c max_parallel_workers_per_gather=4
-c max_parallel_workers=14
-c max_parallel_maintenance_workers=4
restart: always
volumes:
- postgresql-data:/var/lib/postgresql/data:delegated
Expand Down
15 changes: 14 additions & 1 deletion tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,21 @@

from data_diff import databases as db
import logging
import os

# Test/benchmark tuning, driven by environment variables.
DEFAULT_N_SAMPLES = 50
# Number of rows to generate per test table (N_SAMPLES env var overrides).
N_SAMPLES = int(os.environ.get('N_SAMPLES', DEFAULT_N_SAMPLES))
# NOTE(review): any non-empty string ("0", "false", ...) counts as truthy
# here; unset the variable (or set it empty) to disable.
BENCHMARK = os.environ.get('BENCHMARK', False)

level = logging.WARN
if os.environ.get('DEBUG', False):
    level = logging.DEBUG

# basicConfig only takes effect on its first call, so it must not be
# preceded by another basicConfig call at import time.
logging.basicConfig(level=level)
logging.getLogger("diff_tables").setLevel(level)
logging.getLogger("database").setLevel(level)
if BENCHMARK:
    logging.getLogger("benchmark").setLevel(logging.DEBUG)

TEST_MYSQL_CONN_STRING: str = "mysql://mysql:Password1@localhost/mysql"
TEST_POSTGRESQL_CONN_STRING: str = None
Expand Down
Loading