dev/provision.py (20 changes: 11 additions & 9 deletions)
@@ -22,7 +22,17 @@
 from pyiceberg.schema import Schema
 from pyiceberg.types import FixedType, NestedField, UUIDType
 
-spark = SparkSession.builder.getOrCreate()
+# The configuration is important; otherwise we get many small
+# Parquet files with a single row. When a positional delete
+# hits a Parquet file with one row, the Parquet file gets
+# dropped instead of producing a merge-on-read delete file.
+spark = (
+    SparkSession
+    .builder
+    .config("spark.sql.shuffle.partitions", "1")
+    .config("spark.default.parallelism", "1")
+    .getOrCreate()
+)
 
 catalogs = {
     'rest': load_catalog(
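Why these two settings matter: without them, Spark's default of 200 shuffle partitions spreads each small INSERT across many one-row Parquet files, and a positional delete that covers an entire one-row file is applied by dropping the file outright instead of writing a merge-on-read delete file. A minimal sketch of how one could confirm the file layout after provisioning, assuming the rest catalog defined in this script and an Iceberg runtime that exposes the files metadata table (the query is illustrative, not part of this change):

# Illustrative check, not part of the PR: with shuffle partitions and
# parallelism pinned to 1, each INSERT should land in one multi-row
# Parquet file rather than many single-row files.
spark.sql(
    "SELECT file_path, record_count "
    "FROM rest.default.test_positional_mor_deletes.files"
).show(truncate=False)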
@@ -120,10 +130,6 @@
"""
)

# Partitioning is not really needed, but there is a bug:
# https://github.com/apache/iceberg/pull/7685
spark.sql(f"ALTER TABLE {catalog_name}.default.test_positional_mor_deletes ADD PARTITION FIELD years(dt) AS dt_years")

spark.sql(
f"""
INSERT INTO {catalog_name}.default.test_positional_mor_deletes
@@ -168,10 +174,6 @@
"""
)

# Partitioning is not really needed, but there is a bug:
# https://github.com/apache/iceberg/pull/7685
spark.sql(f"ALTER TABLE {catalog_name}.default.test_positional_mor_double_deletes ADD PARTITION FIELD years(dt) AS dt_years")

spark.sql(
f"""
INSERT INTO {catalog_name}.default.test_positional_mor_double_deletes
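Both tables drop the years(dt) partition-field workaround; per the removed comments it existed only to dodge apache/iceberg#7685, not because the tables need partitioning. To confirm the deletes are still materialized as position-delete files after provisioning, here is a hedged sketch, assuming an Iceberg runtime new enough to expose the position_deletes metadata table (illustrative, not part of this change):

# Illustrative only: a non-empty result means the DELETEs were written
# as position-delete files (merge-on-read) instead of rewriting data.
for table in (
    "rest.default.test_positional_mor_deletes",
    "rest.default.test_positional_mor_double_deletes",
):
    spark.sql(f"SELECT file_path, pos FROM {table}.position_deletes").show(truncate=False)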