diff --git a/tests/conftest.py b/tests/conftest.py index 584b6c633a..79560bc532 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -47,6 +47,7 @@ import boto3 import pytest from moto import mock_aws +from pydantic_core import to_json from pyiceberg.catalog import Catalog, load_catalog from pyiceberg.catalog.noop import NoopCatalog @@ -67,10 +68,12 @@ ) from pyiceberg.io.fsspec import FsspecFileIO from pyiceberg.manifest import DataFile, FileFormat +from pyiceberg.partitioning import PartitionField, PartitionSpec from pyiceberg.schema import Accessor, Schema from pyiceberg.serializers import ToOutputFile from pyiceberg.table import FileScanTask, Table from pyiceberg.table.metadata import TableMetadataV1, TableMetadataV2 +from pyiceberg.transforms import DayTransform, IdentityTransform from pyiceberg.types import ( BinaryType, BooleanType, @@ -1255,8 +1258,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: {"key": 15, "value": 0}, ], "lower_bounds": [ - {"key": 2, "value": b"2020-04-01 00:00"}, - {"key": 3, "value": b"2020-04-01 00:12"}, + {"key": 2, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 3, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"}, {"key": 7, "value": b"\x03\x00\x00\x00"}, {"key": 8, "value": b"\x01\x00\x00\x00"}, {"key": 10, "value": b"\xf6(\\\x8f\xc2\x05S\xc0"}, @@ -1270,8 +1273,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: {"key": 19, "value": b"\x00\x00\x00\x00\x00\x00\x04\xc0"}, ], "upper_bounds": [ - {"key": 2, "value": b"2020-04-30 23:5:"}, - {"key": 3, "value": b"2020-05-01 00:41"}, + {"key": 2, "value": b"\x06\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 3, "value": b"\x06\x00\x00\x00\x00\x00\x00\x00"}, {"key": 7, "value": b"\t\x01\x00\x00"}, {"key": 8, "value": b"\t\x01\x00\x00"}, {"key": 10, "value": b"\xcd\xcc\xcc\xcc\xcc,_@"}, @@ -1376,8 +1379,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: ], "lower_bounds": [ {"key": 1, "value": b"\x01\x00\x00\x00"}, - {"key": 2, "value": b"2020-04-01 00:00"}, - {"key": 3, "value": b"2020-04-01 00:03"}, + {"key": 2, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 3, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"}, {"key": 4, "value": b"\x00\x00\x00\x00"}, {"key": 5, "value": b"\x01\x00\x00\x00"}, {"key": 6, "value": b"N"}, @@ -1396,8 +1399,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: ], "upper_bounds": [ {"key": 1, "value": b"\x01\x00\x00\x00"}, - {"key": 2, "value": b"2020-04-30 23:5:"}, - {"key": 3, "value": b"2020-05-01 00:1:"}, + {"key": 2, "value": b"\x06\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 3, "value": b"\x06\x00\x00\x00\x00\x00\x00\x00"}, {"key": 4, "value": b"\x06\x00\x00\x00"}, {"key": 5, "value": b"c\x00\x00\x00"}, {"key": 6, "value": b"Y"}, @@ -1858,7 +1861,24 @@ def simple_map() -> MapType: @pytest.fixture(scope="session") -def generated_manifest_entry_file(avro_schema_manifest_entry: Dict[str, Any]) -> Generator[str, None, None]: +def test_schema() -> Schema: + return Schema( + NestedField(1, "VendorID", IntegerType(), False), NestedField(2, "tpep_pickup_datetime", TimestampType(), False) + ) + + +@pytest.fixture(scope="session") +def test_partition_spec() -> Schema: + return PartitionSpec( + PartitionField(1, 1000, IdentityTransform(), "VendorID"), + PartitionField(2, 1001, DayTransform(), "tpep_pickup_day"), + ) + + +@pytest.fixture(scope="session") +def generated_manifest_entry_file( + avro_schema_manifest_entry: Dict[str, Any], test_schema: Schema, test_partition_spec: PartitionSpec +) -> Generator[str, None, None]: from fastavro import parse_schema, writer parsed_schema = parse_schema(avro_schema_manifest_entry) @@ -1866,7 +1886,15 @@ def generated_manifest_entry_file(avro_schema_manifest_entry: Dict[str, Any]) -> with TemporaryDirectory() as tmpdir: tmp_avro_file = tmpdir + "/manifest.avro" with open(tmp_avro_file, "wb") as out: - writer(out, parsed_schema, manifest_entry_records) + writer( + out, + parsed_schema, + manifest_entry_records, + metadata={ + "schema": test_schema.model_dump_json(), + "partition-spec": to_json(test_partition_spec.fields).decode("utf-8"), + }, + ) yield tmp_avro_file diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index 825431c776..7c62b9564c 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -38,10 +38,9 @@ write_manifest, write_manifest_list, ) -from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec +from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.table.snapshots import Operation, Snapshot, Summary -from pyiceberg.transforms import IdentityTransform from pyiceberg.typedef import Record, TableVersion from pyiceberg.types import IntegerType, NestedField @@ -154,8 +153,8 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None: } assert data_file.nan_value_counts == {16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0} assert data_file.lower_bounds == { - 2: b"2020-04-01 00:00", - 3: b"2020-04-01 00:12", + 2: b"\x01\x00\x00\x00\x00\x00\x00\x00", + 3: b"\x01\x00\x00\x00\x00\x00\x00\x00", 7: b"\x03\x00\x00\x00", 8: b"\x01\x00\x00\x00", 10: b"\xf6(\\\x8f\xc2\x05S\xc0", @@ -169,8 +168,8 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None: 19: b"\x00\x00\x00\x00\x00\x00\x04\xc0", } assert data_file.upper_bounds == { - 2: b"2020-04-30 23:5:", - 3: b"2020-05-01 00:41", + 2: b"\x06\x00\x00\x00\x00\x00\x00\x00", + 3: b"\x06\x00\x00\x00\x00\x00\x00\x00", 7: b"\t\x01\x00\x00", 8: b"\t\x01\x00\x00", 10: b"\xcd\xcc\xcc\xcc\xcc,_@", @@ -363,6 +362,8 @@ def test_write_manifest( generated_manifest_file_file_v1: str, generated_manifest_file_file_v2: str, format_version: TableVersion, + test_schema: Schema, + test_partition_spec: PartitionSpec, compression: AvroCompressionCodec, ) -> None: io = load_file_io() @@ -376,20 +377,12 @@ def test_write_manifest( ) demo_manifest_file = snapshot.manifests(io)[0] manifest_entries = demo_manifest_file.fetch_manifest_entry(io) - test_schema = Schema( - NestedField(1, "VendorID", IntegerType(), False), NestedField(2, "tpep_pickup_datetime", IntegerType(), False) - ) - test_spec = PartitionSpec( - PartitionField(source_id=1, field_id=1, transform=IdentityTransform(), name="VendorID"), - PartitionField(source_id=2, field_id=2, transform=IdentityTransform(), name="tpep_pickup_datetime"), - spec_id=demo_manifest_file.partition_spec_id, - ) with TemporaryDirectory() as tmpdir: tmp_avro_file = tmpdir + "/test_write_manifest.avro" output = io.new_output(tmp_avro_file) with write_manifest( format_version=format_version, - spec=test_spec, + spec=test_partition_spec, schema=test_schema, output_file=output, snapshot_id=8744736658442914487, @@ -404,7 +397,7 @@ def test_write_manifest( expected_metadata = { "schema": test_schema.model_dump_json(), - "partition-spec": """[{"source-id":1,"field-id":1,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":2,"transform":"identity","name":"tpep_pickup_datetime"}]""", + "partition-spec": """[{"source-id":1,"field-id":1000,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":1001,"transform":"day","name":"tpep_pickup_day"}]""", "partition-spec-id": str(demo_manifest_file.partition_spec_id), "format-version": str(format_version), } @@ -497,8 +490,8 @@ def test_write_manifest( } assert data_file.nan_value_counts == {16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0} assert data_file.lower_bounds == { - 2: b"2020-04-01 00:00", - 3: b"2020-04-01 00:12", + 2: b"\x01\x00\x00\x00\x00\x00\x00\x00", + 3: b"\x01\x00\x00\x00\x00\x00\x00\x00", 7: b"\x03\x00\x00\x00", 8: b"\x01\x00\x00\x00", 10: b"\xf6(\\\x8f\xc2\x05S\xc0", @@ -512,8 +505,8 @@ def test_write_manifest( 19: b"\x00\x00\x00\x00\x00\x00\x04\xc0", } assert data_file.upper_bounds == { - 2: b"2020-04-30 23:5:", - 3: b"2020-05-01 00:41", + 2: b"\x06\x00\x00\x00\x00\x00\x00\x00", + 3: b"\x06\x00\x00\x00\x00\x00\x00\x00", 7: b"\t\x01\x00\x00", 8: b"\t\x01\x00\x00", 10: b"\xcd\xcc\xcc\xcc\xcc,_@",