From 42f9778e53500cbf5c5b996baf6b2d3dfb46c8a2 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Wed, 11 Sep 2024 12:05:24 +0000 Subject: [PATCH 01/10] adds additional functionality to cover more partitioning capability --- sqlalchemy_bigquery/base.py | 54 +++++++++++-- tests/unit/test_table_options.py | 134 +++++++++++++++++++------------ 2 files changed, 131 insertions(+), 57 deletions(-) diff --git a/sqlalchemy_bigquery/base.py b/sqlalchemy_bigquery/base.py index c531c102..84a9338f 100644 --- a/sqlalchemy_bigquery/base.py +++ b/sqlalchemy_bigquery/base.py @@ -812,7 +812,9 @@ def _raise_for_type(self, option, value, expected_type): ) def _process_time_partitioning( - self, table: Table, time_partitioning: TimePartitioning + self, + table: Table, + time_partitioning: TimePartitioning, ): """ Generates a SQL 'PARTITION BY' clause for partitioning a table by a date or timestamp. @@ -830,23 +832,61 @@ def _process_time_partitioning( - Given a table with a TIMESTAMP type column 'event_timestamp' and setting 'time_partitioning.field' to 'event_timestamp', the function returns "PARTITION BY TIMESTAMP_TRUNC(event_timestamp, DAY)". + + Current inputs allowed by BQ and covered by this function include: + * _PARTITIONDATE + * DATETIME_TRUNC(, DAY/HOUR/MONTH/YEAR) + * TIMESTAMP_TRUNC(, DAY/HOUR/MONTH/YEAR) + * DATE_TRUNC(, MONTH/YEAR) + + Additional allowed options not explicitly covered by this function + include: + * DATE(_PARTITIONTIME) + * DATE() + * DATE() + * DATE column """ + field = "_PARTITIONDATE" trunc_fn = "DATE_TRUNC" + # Format used with _PARTITIONDATE which can only be used for + # DAY / MONTH / YEAR + if time_partitioning.field is None and field == "_PARTITIONDATE": + if time_partitioning.type_ in {"DAY", "MONTH", "YEAR"}: + return f"PARTITION BY {trunc_fn}({field})" + else: + raise ValueError( + f"_PARTITIONDATE can only be used with TimePartitioningTypes {{DAY, MONTH, YEAR}} received {time_partitioning.type_}" + ) + if time_partitioning.field is not None: field = time_partitioning.field + if isinstance( - table.columns[time_partitioning.field].type, - sqlalchemy.sql.sqltypes.DATE, + table.columns[field].type, + (sqlalchemy.sql.sqltypes.TIMESTAMP), ): - return f"PARTITION BY {field}" + trunc_fn = "TIMESTAMP_TRUNC" elif isinstance( - table.columns[time_partitioning.field].type, - sqlalchemy.sql.sqltypes.TIMESTAMP, + table.columns[field].type, + sqlalchemy.sql.sqltypes.DATETIME, ): - trunc_fn = "TIMESTAMP_TRUNC" + trunc_fn = "DATETIME_TRUNC" + + if isinstance( + table.columns[field].type, + sqlalchemy.sql.sqltypes.DATE, + ): + if time_partitioning.type_ in {"DAY", "MONTH", "YEAR"}: + # CHECK for type: DAY/MONTH/YEAR + trunc_fn = "DATE_TRUNC" + else: + raise ValueError( + f"DATE_TRUNC can only be used with TimePartitioningTypes {{DAY, MONTH, YEAR}} received {time_partitioning.type_}" + ) + # Format used with generically with DATE, TIMESTAMP, DATETIME return f"PARTITION BY {trunc_fn}({field}, {time_partitioning.type_})" def _process_range_partitioning( diff --git a/tests/unit/test_table_options.py b/tests/unit/test_table_options.py index 2b757e04..985d42bd 100644 --- a/tests/unit/test_table_options.py +++ b/tests/unit/test_table_options.py @@ -104,20 +104,59 @@ def test_table_clustering_fields_dialect_option_type_error(faux_conn): ) -def test_table_time_partitioning_dialect_option(faux_conn): - # expect table creation to fail as SQLite does not support partitioned tables +# DATETIME with type and field +@pytest.mark.parametrize( + "column_dtype,time_partitioning_type,func_name", + [ + # DATE dtype + pytest.param( + sqlalchemy.DATE, + TimePartitioningType.HOUR, + "DATE_TRUNC", + marks=pytest.mark.xfail, + ), + (sqlalchemy.DATE, TimePartitioningType.DAY, "DATE_TRUNC"), + (sqlalchemy.DATE, TimePartitioningType.MONTH, "DATE_TRUNC"), + (sqlalchemy.DATE, TimePartitioningType.YEAR, "DATE_TRUNC"), + # TIMESTAMP dtype + (sqlalchemy.TIMESTAMP, TimePartitioningType.HOUR, "TIMESTAMP_TRUNC"), + (sqlalchemy.TIMESTAMP, TimePartitioningType.DAY, "TIMESTAMP_TRUNC"), + (sqlalchemy.TIMESTAMP, TimePartitioningType.MONTH, "TIMESTAMP_TRUNC"), + (sqlalchemy.TIMESTAMP, TimePartitioningType.YEAR, "TIMESTAMP_TRUNC"), + # DATETIME dtype + (sqlalchemy.DATETIME, TimePartitioningType.HOUR, "DATETIME_TRUNC"), + (sqlalchemy.DATETIME, TimePartitioningType.DAY, "DATETIME_TRUNC"), + (sqlalchemy.DATETIME, TimePartitioningType.MONTH, "DATETIME_TRUNC"), + (sqlalchemy.DATETIME, TimePartitioningType.YEAR, "DATETIME_TRUNC"), + ], +) +def test_table_time_partitioning_date_timestamp_and_datetime_dialect_option( + faux_conn, column_dtype, time_partitioning_type, func_name +): + """Expect table creation to fail as SQLite does not support partitioned tables + + Each parametrization ensures that the appropriate function is generated + depending on whether the column datatype is DATE, TIMESTAMP, DATETIME and + whether the TimePartitioningType is HOUR, DAY, MONTH, YEAR. + + Notes: + * BigQuery will not partition on DATE by HOUR, so that is expected to xfail. + """ + with pytest.raises(sqlite3.OperationalError): setup_table( faux_conn, "some_table", sqlalchemy.Column("id", sqlalchemy.Integer), - sqlalchemy.Column("createdAt", sqlalchemy.DateTime), - bigquery_time_partitioning=TimePartitioning(), + sqlalchemy.Column("createdAt", column_dtype), + bigquery_time_partitioning=TimePartitioning( + field="createdAt", type_=time_partitioning_type + ), ) assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( - "CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )" - " PARTITION BY DATE_TRUNC(_PARTITIONDATE, DAY)" + f"CREATE TABLE `some_table` ( `id` INT64, `createdAt` {column_dtype.__visit_name__} )" + f" PARTITION BY {func_name}(createdAt, {time_partitioning_type})" ) @@ -139,75 +178,70 @@ def test_table_require_partition_filter_dialect_option(faux_conn): ) +# DATETIME WITH FIELD but no TYPE: defaults to DAY def test_table_time_partitioning_with_field_dialect_option(faux_conn): - # expect table creation to fail as SQLite does not support partitioned tables - with pytest.raises(sqlite3.OperationalError): - setup_table( - faux_conn, - "some_table", - sqlalchemy.Column("id", sqlalchemy.Integer), - sqlalchemy.Column("createdAt", sqlalchemy.DateTime), - bigquery_time_partitioning=TimePartitioning(field="createdAt"), - ) - - assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( - "CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )" - " PARTITION BY DATE_TRUNC(createdAt, DAY)" - ) + """Expect table creation to fail as SQLite does not support partitioned tables + Confirms that if the column datatype is DATETIME but no TIMEPARTITIONINGTYPE + has been supplied, the system will default to DAY. + """ -def test_table_time_partitioning_by_month_dialect_option(faux_conn): - # expect table creation to fail as SQLite does not support partitioned tables with pytest.raises(sqlite3.OperationalError): setup_table( faux_conn, "some_table", sqlalchemy.Column("id", sqlalchemy.Integer), sqlalchemy.Column("createdAt", sqlalchemy.DateTime), - bigquery_time_partitioning=TimePartitioning( - field="createdAt", - type_=TimePartitioningType.MONTH, - ), - ) - - assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( - "CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )" - " PARTITION BY DATE_TRUNC(createdAt, MONTH)" - ) - - -def test_table_time_partitioning_with_timestamp_dialect_option(faux_conn): - # expect table creation to fail as SQLite does not support partitioned tables - with pytest.raises(sqlite3.OperationalError): - setup_table( - faux_conn, - "some_table", - sqlalchemy.Column("id", sqlalchemy.Integer), - sqlalchemy.Column("createdAt", sqlalchemy.TIMESTAMP), bigquery_time_partitioning=TimePartitioning(field="createdAt"), ) assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( - "CREATE TABLE `some_table` ( `id` INT64, `createdAt` TIMESTAMP )" - " PARTITION BY TIMESTAMP_TRUNC(createdAt, DAY)" + "CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )" + " PARTITION BY DATE_TRUNC(createdAt, DAY)" ) -def test_table_time_partitioning_with_date_dialect_option(faux_conn): - # expect table creation to fail as SQLite does not support partitioned tables +@pytest.mark.parametrize( + "column_dtype,time_partitioning_type,func_name", + [ + pytest.param( + sqlalchemy.DATE, + TimePartitioningType.HOUR, + "DATE_TRUNC", + marks=pytest.mark.xfail, + ), + (sqlalchemy.DATE, TimePartitioningType.DAY, "DATE_TRUNC"), + (sqlalchemy.DATE, TimePartitioningType.MONTH, "DATE_TRUNC"), + (sqlalchemy.DATE, TimePartitioningType.YEAR, "DATE_TRUNC"), + ], +) +def test_table_time_partitioning_with_partitiondate_option( + faux_conn, column_dtype, time_partitioning_type, func_name +): + """Expect table creation to fail as SQLite does not support partitioned tables + + Each parametrization ensures that the appropriate function is generated + depending on the pseudocolumn datatype is _PARTITIONDATE and whether the + TimePartitioningType is HOUR, DAY, MONTH, YEAR. + + Notes: + * BigQuery will not partition on _PARTITIONDATE by HOUR, so that is expected + to xfail. + """ with pytest.raises(sqlite3.OperationalError): setup_table( faux_conn, "some_table_2", + # schema=str([id_, createdAt]), sqlalchemy.Column("id", sqlalchemy.Integer), - sqlalchemy.Column("createdAt", sqlalchemy.DATE), - bigquery_time_partitioning=TimePartitioning(field="createdAt"), + sqlalchemy.Column("createdAt", column_dtype), + bigquery_time_partitioning=TimePartitioning(type_=time_partitioning_type), ) # confirm that the following code creates the correct SQL string assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( - "CREATE TABLE `some_table_2` ( `id` INT64, `createdAt` DATE )" - " PARTITION BY createdAt" + f"CREATE TABLE `some_table_2` ( `id` INT64, `createdAt` {column_dtype.__visit_name__} )" + f" PARTITION BY {func_name}(_PARTITIONDATE)" ) From d64e0cfb05d94f175edd7fb1b6b4e6b0b1dfac2a Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Fri, 8 Nov 2024 21:27:46 +0000 Subject: [PATCH 02/10] Updates the partitioning algorithm and tests --- sqlalchemy_bigquery/base.py | 84 ++++++++++++++++---------------- tests/unit/test_table_options.py | 77 ++++++++++++++++++----------- 2 files changed, 89 insertions(+), 72 deletions(-) diff --git a/sqlalchemy_bigquery/base.py b/sqlalchemy_bigquery/base.py index 84a9338f..e7574a21 100644 --- a/sqlalchemy_bigquery/base.py +++ b/sqlalchemy_bigquery/base.py @@ -817,20 +817,21 @@ def _process_time_partitioning( time_partitioning: TimePartitioning, ): """ - Generates a SQL 'PARTITION BY' clause for partitioning a table by a date or timestamp. + Generates a SQL 'PARTITION BY' clause for partitioning a table, Args: - - table (Table): The SQLAlchemy table object representing the BigQuery table to be partitioned. + - table (Table): The SQLAlchemy table object representing the BigQuery + table to be partitioned. - time_partitioning (TimePartitioning): The time partitioning details, including the field to be used for partitioning. Returns: - - str: A SQL 'PARTITION BY' clause that uses either TIMESTAMP_TRUNC or DATE_TRUNC to - partition data on the specified field. + - str: A SQL 'PARTITION BY' clause. Example: - - Given a table with a TIMESTAMP type column 'event_timestamp' and setting - 'time_partitioning.field' to 'event_timestamp', the function returns + - Given a table with an 'event_timestamp' and setting time_partitioning.type + as DAY and by setting 'time_partitioning.field' as 'event_timestamp', the + function returns: "PARTITION BY TIMESTAMP_TRUNC(event_timestamp, DAY)". Current inputs allowed by BQ and covered by this function include: @@ -839,55 +840,52 @@ def _process_time_partitioning( * TIMESTAMP_TRUNC(, DAY/HOUR/MONTH/YEAR) * DATE_TRUNC(, MONTH/YEAR) - Additional allowed options not explicitly covered by this function - include: + Additional options allowed by BQ but not explicitly covered by this + function include: * DATE(_PARTITIONTIME) * DATE() * DATE() * DATE column """ - field = "_PARTITIONDATE" - trunc_fn = "DATE_TRUNC" - - # Format used with _PARTITIONDATE which can only be used for - # DAY / MONTH / YEAR - if time_partitioning.field is None and field == "_PARTITIONDATE": - if time_partitioning.type_ in {"DAY", "MONTH", "YEAR"}: - return f"PARTITION BY {trunc_fn}({field})" - else: - raise ValueError( - f"_PARTITIONDATE can only be used with TimePartitioningTypes {{DAY, MONTH, YEAR}} received {time_partitioning.type_}" - ) + sqltypes = { + "_PARTITIONDATE": ("_PARTITIONDATE", None), + "TIMESTAMP": ("TIMESTAMP_TRUNC", {"DAY", "HOUR", "MONTH", "YEAR"}), + "DATETIME": ("DATETIME_TRUNC", {"DAY", "HOUR", "MONTH", "YEAR"}), + "DATE": ("DATE_TRUNC", {"DAY", "MONTH", "YEAR"}), + } + # Extract field (i.e or _PARTITIONDATE) + # AND extract the name of the column_type (i.e. "TIMESTAMP", "DATE", + # "DATETIME", "_PARTITIONDATE") if time_partitioning.field is not None: field = time_partitioning.field + column_type = table.columns[field].type.__visit_name__.upper() - if isinstance( - table.columns[field].type, - (sqlalchemy.sql.sqltypes.TIMESTAMP), - ): - trunc_fn = "TIMESTAMP_TRUNC" - elif isinstance( - table.columns[field].type, - sqlalchemy.sql.sqltypes.DATETIME, - ): - trunc_fn = "DATETIME_TRUNC" + else: + field = "_PARTITIONDATE" + column_type = "_PARTITIONDATE" - if isinstance( - table.columns[field].type, - sqlalchemy.sql.sqltypes.DATE, - ): - if time_partitioning.type_ in {"DAY", "MONTH", "YEAR"}: - # CHECK for type: DAY/MONTH/YEAR - trunc_fn = "DATE_TRUNC" - else: - raise ValueError( - f"DATE_TRUNC can only be used with TimePartitioningTypes {{DAY, MONTH, YEAR}} received {time_partitioning.type_}" - ) + # Extract time_partitioning.type_ (DAY, HOUR, MONTH, YEAR) + # i.e. generates one partition per type (1/DAY, 1/HOUR) + if time_partitioning.type_ is not None: + partitioning_period = time_partitioning.type_ + + # Extract the truncation_function (i.e. DATE_TRUNC) + # and the set of allowable partition_periods + # that can be used in that function + trunc_fn, allowed_partitions = sqltypes[column_type] + + # Create output: + # Special Case: _PARTITIONDATE does NOT use a function or partitioning_period + if trunc_fn == "_PARTITIONDATE": + return f"PARTITION BY {field}" + + # Generic Case + if partitioning_period not in allowed_partitions: + raise ValueError("error msg") - # Format used with generically with DATE, TIMESTAMP, DATETIME - return f"PARTITION BY {trunc_fn}({field}, {time_partitioning.type_})" + return f"PARTITION BY {trunc_fn}({field}, {partitioning_period})" def _process_range_partitioning( self, table: Table, range_partitioning: RangePartitioning diff --git a/tests/unit/test_table_options.py b/tests/unit/test_table_options.py index 985d42bd..fa7cd3b3 100644 --- a/tests/unit/test_table_options.py +++ b/tests/unit/test_table_options.py @@ -104,7 +104,6 @@ def test_table_clustering_fields_dialect_option_type_error(faux_conn): ) -# DATETIME with type and field @pytest.mark.parametrize( "column_dtype,time_partitioning_type,func_name", [ @@ -133,14 +132,17 @@ def test_table_clustering_fields_dialect_option_type_error(faux_conn): def test_table_time_partitioning_date_timestamp_and_datetime_dialect_option( faux_conn, column_dtype, time_partitioning_type, func_name ): - """Expect table creation to fail as SQLite does not support partitioned tables + """NOTE: Expect table creation to fail as SQLite does not support + partitioned tables, despite that, we are still able to test the generation + of SQL statements. Each parametrization ensures that the appropriate function is generated depending on whether the column datatype is DATE, TIMESTAMP, DATETIME and whether the TimePartitioningType is HOUR, DAY, MONTH, YEAR. - Notes: - * BigQuery will not partition on DATE by HOUR, so that is expected to xfail. + `DATE_TRUNC` only returns a result if TimePartitioningType is DAY, MONTH, + YEAR. BigQuery cannot partition on DATE by HOUR, so that is expected to + xfail. """ with pytest.raises(sqlite3.OperationalError): @@ -154,10 +156,12 @@ def test_table_time_partitioning_date_timestamp_and_datetime_dialect_option( ), ) - assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( + result = " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) + expected = ( f"CREATE TABLE `some_table` ( `id` INT64, `createdAt` {column_dtype.__visit_name__} )" f" PARTITION BY {func_name}(createdAt, {time_partitioning_type})" ) + assert result == expected def test_table_require_partition_filter_dialect_option(faux_conn): @@ -171,12 +175,15 @@ def test_table_require_partition_filter_dialect_option(faux_conn): bigquery_require_partition_filter=True, ) - assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( + result = " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) + expected = ( "CREATE TABLE `some_table` ( `createdAt` DATETIME )" - " PARTITION BY DATE_TRUNC(createdAt, DAY)" + " PARTITION BY DATETIME_TRUNC(createdAt, DAY)" " OPTIONS(require_partition_filter=true)" ) + assert result == expected + # DATETIME WITH FIELD but no TYPE: defaults to DAY def test_table_time_partitioning_with_field_dialect_option(faux_conn): @@ -194,55 +201,64 @@ def test_table_time_partitioning_with_field_dialect_option(faux_conn): sqlalchemy.Column("createdAt", sqlalchemy.DateTime), bigquery_time_partitioning=TimePartitioning(field="createdAt"), ) - - assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( + result = " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) + expected = ( "CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )" - " PARTITION BY DATE_TRUNC(createdAt, DAY)" + " PARTITION BY DATETIME_TRUNC(createdAt, DAY)" ) + print(f"DINOSAUR: {result}\n\n{expected}") + assert result == expected @pytest.mark.parametrize( - "column_dtype,time_partitioning_type,func_name", + "column_dtype,time_partitioning_type", [ pytest.param( sqlalchemy.DATE, TimePartitioningType.HOUR, - "DATE_TRUNC", marks=pytest.mark.xfail, ), - (sqlalchemy.DATE, TimePartitioningType.DAY, "DATE_TRUNC"), - (sqlalchemy.DATE, TimePartitioningType.MONTH, "DATE_TRUNC"), - (sqlalchemy.DATE, TimePartitioningType.YEAR, "DATE_TRUNC"), + (sqlalchemy.DATE, TimePartitioningType.DAY), + (sqlalchemy.DATE, TimePartitioningType.MONTH), + (sqlalchemy.DATE, TimePartitioningType.YEAR), ], ) def test_table_time_partitioning_with_partitiondate_option( - faux_conn, column_dtype, time_partitioning_type, func_name + faux_conn, + column_dtype, + time_partitioning_type, ): - """Expect table creation to fail as SQLite does not support partitioned tables + """NOTE: Expect table creation to fail as SQLite does not support + partitioned tables, despite that, we are still able to test the generation + of SQL statements - Each parametrization ensures that the appropriate function is generated - depending on the pseudocolumn datatype is _PARTITIONDATE and whether the - TimePartitioningType is HOUR, DAY, MONTH, YEAR. + If the `field` argument to TimePartitioning() is not provided, it detaults to + None. That causes the pseudocolumn "_PARTITIONDATE" to be used by default as + the column to partition by. - Notes: - * BigQuery will not partition on _PARTITIONDATE by HOUR, so that is expected - to xfail. + _PARTITIONTIME only returns a result if TimePartitioningType is DAY, MONTH, + YEAR. BigQuery cannot partition on _PARTITIONDATE by HOUR, so that is + expected to xfail. """ + with pytest.raises(sqlite3.OperationalError): setup_table( faux_conn, "some_table_2", - # schema=str([id_, createdAt]), sqlalchemy.Column("id", sqlalchemy.Integer), sqlalchemy.Column("createdAt", column_dtype), bigquery_time_partitioning=TimePartitioning(type_=time_partitioning_type), ) # confirm that the following code creates the correct SQL string - assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( + result = " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) + + # We need two versions of expected depending on whether we use _PARTITIONDATE + expected = ( f"CREATE TABLE `some_table_2` ( `id` INT64, `createdAt` {column_dtype.__visit_name__} )" - f" PARTITION BY {func_name}(_PARTITIONDATE)" + f" PARTITION BY _PARTITIONDATE" ) + assert result == expected def test_table_time_partitioning_dialect_option_partition_expiration_days(faux_conn): @@ -261,7 +277,7 @@ def test_table_time_partitioning_dialect_option_partition_expiration_days(faux_c assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( "CREATE TABLE `some_table` ( `createdAt` DATETIME )" - " PARTITION BY DATE_TRUNC(createdAt, DAY)" + " PARTITION BY DATETIME_TRUNC(createdAt, DAY)" " OPTIONS(partition_expiration_days=0.25)" ) @@ -434,13 +450,16 @@ def test_table_all_dialect_option(faux_conn): ), ) - assert " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) == ( + result = " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) + expected = ( "CREATE TABLE `some_table` ( `id` INT64, `country` STRING, `town` STRING, `createdAt` DATETIME )" - " PARTITION BY DATE_TRUNC(createdAt, DAY)" + " PARTITION BY DATETIME_TRUNC(createdAt, DAY)" " CLUSTER BY country, town" " OPTIONS(partition_expiration_days=30.0, expiration_timestamp=TIMESTAMP '2038-01-01 00:00:00+00:00', require_partition_filter=true, default_rounding_mode='ROUND_HALF_EVEN')" ) + assert result == expected + def test_validate_friendly_name_value_type(ddl_compiler): # expect option value to be transformed as a string expression From bb84d955d58bf649b491491eefbb67e68412e05e Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Mon, 11 Nov 2024 15:33:31 +0000 Subject: [PATCH 03/10] Updates special case and tests --- sqlalchemy_bigquery/base.py | 23 +++++++++++++++++++++-- tests/system/test_sqlalchemy_bigquery.py | 11 ++++++++++- tests/unit/test_table_options.py | 10 +++++++--- 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/sqlalchemy_bigquery/base.py b/sqlalchemy_bigquery/base.py index e7574a21..aabda386 100644 --- a/sqlalchemy_bigquery/base.py +++ b/sqlalchemy_bigquery/base.py @@ -852,7 +852,7 @@ def _process_time_partitioning( "_PARTITIONDATE": ("_PARTITIONDATE", None), "TIMESTAMP": ("TIMESTAMP_TRUNC", {"DAY", "HOUR", "MONTH", "YEAR"}), "DATETIME": ("DATETIME_TRUNC", {"DAY", "HOUR", "MONTH", "YEAR"}), - "DATE": ("DATE_TRUNC", {"DAY", "MONTH", "YEAR"}), + "DATE": ("DATE_TRUNC", {"MONTH", "YEAR"}), } # Extract field (i.e or _PARTITIONDATE) @@ -868,6 +868,7 @@ def _process_time_partitioning( # Extract time_partitioning.type_ (DAY, HOUR, MONTH, YEAR) # i.e. generates one partition per type (1/DAY, 1/HOUR) + if time_partitioning.type_ is not None: partitioning_period = time_partitioning.type_ @@ -881,9 +882,27 @@ def _process_time_partitioning( if trunc_fn == "_PARTITIONDATE": return f"PARTITION BY {field}" + # Special Case: BigQuery will not accept DAY as partitioning_period for + # DATE_TRUNC. + # However, the default argument in python-bigquery for TimePartioning + # is DAY. This case overwrites that to avoid making a breaking change in + # python-bigquery. + # https://github.com/googleapis/python-bigquery/blob/a4d9534a900f13ae7355904cda05097d781f27e3/google/cloud/bigquery/table.py#L2916 + if trunc_fn == "DATE_TRUNC" and partitioning_period == "DAY": + raise ValueError( + "The TimePartitioning.type_ must be one of: " + f"{allowed_partitions}, received {partitioning_period}." + "NOTE: the `default` value for TimePartioning.type_ as set in " + "python-bigquery is 'DAY', if you wish to use 'DATE_TRUNC' " + "ensure that you overwrite the default TimePartitioning.type_. " + ) + # Generic Case if partitioning_period not in allowed_partitions: - raise ValueError("error msg") + raise ValueError( + "The TimePartitioning.type_ must be one of: " + f"{allowed_partitions}, received {partitioning_period}." + ) return f"PARTITION BY {trunc_fn}({field}, {partitioning_period})" diff --git a/tests/system/test_sqlalchemy_bigquery.py b/tests/system/test_sqlalchemy_bigquery.py index 7ea4ccc6..b3ebcc25 100644 --- a/tests/system/test_sqlalchemy_bigquery.py +++ b/tests/system/test_sqlalchemy_bigquery.py @@ -561,7 +561,16 @@ def test_dml(engine, session, table_dml): assert len(result) == 0 -@pytest.mark.parametrize("time_partitioning_field", ["timestamp_c", "date_c"]) +@pytest.mark.parametrize( + "time_partitioning_field", + [ + ("timestamp_c"), + ("datetime_c"), + # Fails because python-bigquery TimePartitioning.type_ defaults to "DAY", but + # the DATE_TRUNC() function only allows "MONTH"/"YEAR" + pytest.param("date_c", marks=[pytest.mark.xfail]), + ], +) def test_create_table(engine, bigquery_dataset, time_partitioning_field): meta = MetaData() Table( diff --git a/tests/unit/test_table_options.py b/tests/unit/test_table_options.py index fa7cd3b3..63471cbe 100644 --- a/tests/unit/test_table_options.py +++ b/tests/unit/test_table_options.py @@ -110,11 +110,16 @@ def test_table_clustering_fields_dialect_option_type_error(faux_conn): # DATE dtype pytest.param( sqlalchemy.DATE, - TimePartitioningType.HOUR, + TimePartitioningType.HOUR, # Only MONTH/YEAR are permitted in BigQuery + "DATE_TRUNC", + marks=pytest.mark.xfail, + ), + pytest.param( + sqlalchemy.DATE, + TimePartitioningType.DAY, # Only MONTH/YEAR are permitted in BigQuery "DATE_TRUNC", marks=pytest.mark.xfail, ), - (sqlalchemy.DATE, TimePartitioningType.DAY, "DATE_TRUNC"), (sqlalchemy.DATE, TimePartitioningType.MONTH, "DATE_TRUNC"), (sqlalchemy.DATE, TimePartitioningType.YEAR, "DATE_TRUNC"), # TIMESTAMP dtype @@ -206,7 +211,6 @@ def test_table_time_partitioning_with_field_dialect_option(faux_conn): "CREATE TABLE `some_table` ( `id` INT64, `createdAt` DATETIME )" " PARTITION BY DATETIME_TRUNC(createdAt, DAY)" ) - print(f"DINOSAUR: {result}\n\n{expected}") assert result == expected From dffd70736e22a2a06940bd94ed0de32d6cecdc2f Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Mon, 11 Nov 2024 17:56:34 +0000 Subject: [PATCH 04/10] Updates test in possible effort to increase coverage. --- tests/unit/test_table_options.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/unit/test_table_options.py b/tests/unit/test_table_options.py index 63471cbe..6e6fb9f6 100644 --- a/tests/unit/test_table_options.py +++ b/tests/unit/test_table_options.py @@ -132,6 +132,8 @@ def test_table_clustering_fields_dialect_option_type_error(faux_conn): (sqlalchemy.DATETIME, TimePartitioningType.DAY, "DATETIME_TRUNC"), (sqlalchemy.DATETIME, TimePartitioningType.MONTH, "DATETIME_TRUNC"), (sqlalchemy.DATETIME, TimePartitioningType.YEAR, "DATETIME_TRUNC"), + # TimePartitioning.type_ == None + (sqlalchemy.DATETIME, None, "DATETIME_TRUNC"), ], ) def test_table_time_partitioning_date_timestamp_and_datetime_dialect_option( @@ -148,8 +150,14 @@ def test_table_time_partitioning_date_timestamp_and_datetime_dialect_option( `DATE_TRUNC` only returns a result if TimePartitioningType is DAY, MONTH, YEAR. BigQuery cannot partition on DATE by HOUR, so that is expected to xfail. + + IF time_partitioning_type is None, the __init__() in TimePartitioning will + overwrite it with TimePartitioningType.DAY as the default. """ + if time_partitioning_type is None: + time_partitioning_type = TimePartitioningType.DAY + with pytest.raises(sqlite3.OperationalError): setup_table( faux_conn, From 6eb26564a90e6c08086dc190113ea1aa1f5f65ea Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Mon, 11 Nov 2024 19:44:09 +0000 Subject: [PATCH 05/10] Tweaks the conditionals in time partitioning process --- sqlalchemy_bigquery/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sqlalchemy_bigquery/base.py b/sqlalchemy_bigquery/base.py index aabda386..4247f45a 100644 --- a/sqlalchemy_bigquery/base.py +++ b/sqlalchemy_bigquery/base.py @@ -868,9 +868,9 @@ def _process_time_partitioning( # Extract time_partitioning.type_ (DAY, HOUR, MONTH, YEAR) # i.e. generates one partition per type (1/DAY, 1/HOUR) - - if time_partitioning.type_ is not None: - partitioning_period = time_partitioning.type_ + # NOTE: if time_partitioning.type_ == None, it gets + # immediately overwritten by python-bigquery to a default of DAY. + partitioning_period = time_partitioning.type_ # Extract the truncation_function (i.e. DATE_TRUNC) # and the set of allowable partition_periods From f88e4728ed61b5b04761ed1c7a81a8c6cbc7e0f2 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Tue, 12 Nov 2024 19:58:45 +0000 Subject: [PATCH 06/10] Updates linting --- sqlalchemy_bigquery/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sqlalchemy_bigquery/base.py b/sqlalchemy_bigquery/base.py index 4247f45a..879bcb40 100644 --- a/sqlalchemy_bigquery/base.py +++ b/sqlalchemy_bigquery/base.py @@ -868,7 +868,7 @@ def _process_time_partitioning( # Extract time_partitioning.type_ (DAY, HOUR, MONTH, YEAR) # i.e. generates one partition per type (1/DAY, 1/HOUR) - # NOTE: if time_partitioning.type_ == None, it gets + # NOTE: if time_partitioning.type_ == None, it gets # immediately overwritten by python-bigquery to a default of DAY. partitioning_period = time_partitioning.type_ From 611922dc6731ccc36ffdee228075fa5cce316dd7 Mon Sep 17 00:00:00 2001 From: Mend Renovate Date: Fri, 15 Nov 2024 22:24:40 +0100 Subject: [PATCH 07/10] chore(deps): update all dependencies (#1136) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore(deps): update all dependencies * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * Update protobuf to 5.28.3 * pin google-crc32c for python 3.7/3.8 * Pin mako for python 3.8 * Pin markupsafe for Python 3.8 * Pin pyparsing for python 3.8 * Pin pyparsing for Python 3.8 --------- Co-authored-by: Owl Bot Co-authored-by: Anthonios Partheniou --- dev_requirements.txt | 2 +- samples/snippets/requirements-test.txt | 15 +++---- samples/snippets/requirements.txt | 58 ++++++++++++++------------ 3 files changed, 40 insertions(+), 35 deletions(-) diff --git a/dev_requirements.txt b/dev_requirements.txt index dac67503..d69a162f 100644 --- a/dev_requirements.txt +++ b/dev_requirements.txt @@ -2,4 +2,4 @@ sqlalchemy>=2.0.15,<2.1.0 google-cloud-bigquery>=1.6.0 pytest===6.2.5 pytest-flake8===1.1.0 # versions 1.1.1 and above require pytest 7 -pytz==2024.1 +pytz==2024.2 diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt index df0ceaeb..d73fde11 100644 --- a/samples/snippets/requirements-test.txt +++ b/samples/snippets/requirements-test.txt @@ -1,16 +1,17 @@ -attrs==23.2.0 +attrs==24.2.0 click==8.1.7 -google-auth==2.29.0 +google-auth==2.36.0 google-cloud-testutils==1.4.0 iniconfig==2.0.0 -packaging==24.0 +packaging==24.2 pluggy==1.5.0 py==1.11.0 -pyasn1==0.6.0 -pyasn1-modules==0.4.0 -pyparsing==3.1.2 +pyasn1==0.6.1 +pyasn1-modules==0.4.1 +pyparsing===3.1.4; python_version == '3.8' +pyparsing==3.2.0; python_version >= '3.9' pytest===6.2.5 rsa==4.9 six==1.16.0 toml==0.10.2 -typing-extensions==4.12.1 +typing-extensions==4.12.2 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index e30275cb..ac594392 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,33 +1,37 @@ -alembic==1.13.1 -certifi==2024.7.4 -charset-normalizer==3.3.2 -geoalchemy2==0.15.1 -google-api-core[grpc]==2.19.0 -google-auth==2.29.0 -google-cloud-bigquery==3.24.0 +alembic==1.14.0 +certifi==2024.8.30 +charset-normalizer==3.4.0 +geoalchemy2==0.16.0 +google-api-core[grpc]==2.23.0 +google-auth==2.36.0 +google-cloud-bigquery==3.27.0 google-cloud-core==2.4.1 -google-crc32c==1.5.0 -google-resumable-media==2.7.0 -googleapis-common-protos==1.63.1 -greenlet==3.0.3 -grpcio==1.62.2 -grpcio-status==1.62.2 -idna==3.7 -importlib-resources==6.4.0; python_version >= '3.8' -mako==1.3.5 -markupsafe==2.1.5 -packaging==24.0 -proto-plus==1.23.0 -protobuf===4.25.3 -pyasn1==0.6.0 -pyasn1-modules==0.4.0 -pyparsing==3.1.2 +google-crc32c===1.5.0; python_version == '3.8' +google-crc32c==1.6.0; python_version >= '3.9' +google-resumable-media==2.7.2 +googleapis-common-protos==1.66.0 +greenlet==3.1.1 +grpcio==1.67.1 +grpcio-status==1.67.1 +idna==3.10 +importlib-resources==6.4.5; python_version >= '3.8' +mako===1.3.5; python_version == '3.8' +mako==1.3.6; python_version >= '3.9' +markupsafe===2.1.5; python_version == '3.8' +markupsafe==3.0.2; python_version >= '3.9' +packaging==24.2 +proto-plus==1.25.0 +protobuf==5.28.3 +pyasn1==0.6.1 +pyasn1-modules==0.4.1 +pyparsing===3.1.4; python_version == '3.8' +pyparsing==3.2.0; python_version >= '3.9' python-dateutil==2.9.0.post0 -pytz==2024.1 +pytz==2024.2 requests==2.32.3 rsa==4.9 -shapely==2.0.4 +shapely==2.0.6 six==1.16.0 sqlalchemy===1.4.27 -typing-extensions==4.12.1 -urllib3==2.2.2 +typing-extensions==4.12.2 +urllib3==2.2.3 From 0e9c3b8e8ebc24f0fb860807acf6bae7551b0058 Mon Sep 17 00:00:00 2001 From: Mend Renovate Date: Thu, 21 Nov 2024 01:07:54 +0100 Subject: [PATCH 08/10] chore(deps): update all dependencies (#1140) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore(deps): update all dependencies * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot --- samples/snippets/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index ac594392..c3213aa4 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -11,8 +11,8 @@ google-crc32c==1.6.0; python_version >= '3.9' google-resumable-media==2.7.2 googleapis-common-protos==1.66.0 greenlet==3.1.1 -grpcio==1.67.1 -grpcio-status==1.67.1 +grpcio==1.68.0 +grpcio-status==1.68.0 idna==3.10 importlib-resources==6.4.5; python_version >= '3.8' mako===1.3.5; python_version == '3.8' From 2d25e61d40e25a444a0bbaef311bf5c117837193 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Thu, 21 Nov 2024 16:01:44 +0000 Subject: [PATCH 09/10] Removes duplicate test --- tests/unit/test_table_options.py | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/tests/unit/test_table_options.py b/tests/unit/test_table_options.py index 6e6fb9f6..909e5242 100644 --- a/tests/unit/test_table_options.py +++ b/tests/unit/test_table_options.py @@ -177,32 +177,10 @@ def test_table_time_partitioning_date_timestamp_and_datetime_dialect_option( assert result == expected -def test_table_require_partition_filter_dialect_option(faux_conn): - # expect table creation to fail as SQLite does not support partitioned tables - with pytest.raises(sqlite3.OperationalError): - setup_table( - faux_conn, - "some_table", - sqlalchemy.Column("createdAt", sqlalchemy.DateTime), - bigquery_time_partitioning=TimePartitioning(field="createdAt"), - bigquery_require_partition_filter=True, - ) - - result = " ".join(faux_conn.test_data["execute"][-1][0].strip().split()) - expected = ( - "CREATE TABLE `some_table` ( `createdAt` DATETIME )" - " PARTITION BY DATETIME_TRUNC(createdAt, DAY)" - " OPTIONS(require_partition_filter=true)" - ) - - assert result == expected - - -# DATETIME WITH FIELD but no TYPE: defaults to DAY def test_table_time_partitioning_with_field_dialect_option(faux_conn): """Expect table creation to fail as SQLite does not support partitioned tables - Confirms that if the column datatype is DATETIME but no TIMEPARTITIONINGTYPE + Confirms that if the column datatype is DATETIME but no TimePartitioning.type_ has been supplied, the system will default to DAY. """ From 2229784befdf707bc88b838d646519a60ca6eca3 Mon Sep 17 00:00:00 2001 From: chalmer lowe Date: Fri, 22 Nov 2024 11:24:43 +0000 Subject: [PATCH 10/10] Updates test names and docstrings --- tests/unit/test_table_options.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_table_options.py b/tests/unit/test_table_options.py index 909e5242..ea3322f9 100644 --- a/tests/unit/test_table_options.py +++ b/tests/unit/test_table_options.py @@ -136,7 +136,7 @@ def test_table_clustering_fields_dialect_option_type_error(faux_conn): (sqlalchemy.DATETIME, None, "DATETIME_TRUNC"), ], ) -def test_table_time_partitioning_date_timestamp_and_datetime_dialect_option( +def test_table_time_partitioning_given_field_and_type__dialect_options( faux_conn, column_dtype, time_partitioning_type, func_name ): """NOTE: Expect table creation to fail as SQLite does not support @@ -151,8 +151,12 @@ def test_table_time_partitioning_date_timestamp_and_datetime_dialect_option( YEAR. BigQuery cannot partition on DATE by HOUR, so that is expected to xfail. - IF time_partitioning_type is None, the __init__() in TimePartitioning will - overwrite it with TimePartitioningType.DAY as the default. + A distinguishing characteristic of this test is we provide an argument to + the TimePartitioning class for both field and type_. + + Special case: IF time_partitioning_type is None, the __init__() in the + TimePartitioning class will overwrite it with TimePartitioningType.DAY as + the default. """ if time_partitioning_type is None: @@ -177,11 +181,14 @@ def test_table_time_partitioning_date_timestamp_and_datetime_dialect_option( assert result == expected -def test_table_time_partitioning_with_field_dialect_option(faux_conn): +def test_table_time_partitioning_given_field_but_no_type__dialect_option(faux_conn): """Expect table creation to fail as SQLite does not support partitioned tables Confirms that if the column datatype is DATETIME but no TimePartitioning.type_ has been supplied, the system will default to DAY. + + A distinguishing characteristic of this test is we provide an argument to + the TimePartitioning class for field but not type_. """ with pytest.raises(sqlite3.OperationalError): @@ -213,7 +220,7 @@ def test_table_time_partitioning_with_field_dialect_option(faux_conn): (sqlalchemy.DATE, TimePartitioningType.YEAR), ], ) -def test_table_time_partitioning_with_partitiondate_option( +def test_table_time_partitioning_given_type__but_no_field_dialect_option( faux_conn, column_dtype, time_partitioning_type, @@ -222,13 +229,16 @@ def test_table_time_partitioning_with_partitiondate_option( partitioned tables, despite that, we are still able to test the generation of SQL statements - If the `field` argument to TimePartitioning() is not provided, it detaults to + If the `field` argument to TimePartitioning() is not provided, it defaults to None. That causes the pseudocolumn "_PARTITIONDATE" to be used by default as the column to partition by. _PARTITIONTIME only returns a result if TimePartitioningType is DAY, MONTH, YEAR. BigQuery cannot partition on _PARTITIONDATE by HOUR, so that is expected to xfail. + + A distinguishing characteristic of this test is we provide an argument to + the TimePartitioning class for type_ but not field. """ with pytest.raises(sqlite3.OperationalError):