diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index b21a0f5613..d6a3ff1654 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -421,8 +421,9 @@ def set_when_positive(properties: Dict[str, str], num: int, property_name: str) def ancestors_of(current_snapshot: Optional[Snapshot], table_metadata: TableMetadata) -> Iterable[Snapshot]: """Get the ancestors of and including the given snapshot.""" - if current_snapshot: - yield current_snapshot - if current_snapshot.parent_snapshot_id is not None: - if parent := table_metadata.snapshot_by_id(current_snapshot.parent_snapshot_id): - yield from ancestors_of(parent, table_metadata) + snapshot = current_snapshot + while snapshot is not None: + yield snapshot + if snapshot.parent_snapshot_id is None: + break + snapshot = table_metadata.snapshot_by_id(snapshot.parent_snapshot_id) diff --git a/tests/conftest.py b/tests/conftest.py index d3f23689a2..ba3bf72d06 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -29,10 +29,11 @@ import re import socket import string +import time import uuid from datetime import date, datetime, timezone from pathlib import Path -from random import choice +from random import choice, randint from tempfile import TemporaryDirectory from typing import ( TYPE_CHECKING, @@ -731,6 +732,77 @@ def example_table_metadata_no_snapshot_v1() -> Dict[str, Any]: return EXAMPLE_TABLE_METADATA_NO_SNAPSHOT_V1 +@pytest.fixture +def example_table_metadata_v2_with_extensive_snapshots() -> Dict[str, Any]: + def generate_snapshot( + snapshot_id: int, + parent_snapshot_id: Optional[int] = None, + timestamp_ms: Optional[int] = None, + sequence_number: int = 0, + ) -> Dict[str, Any]: + return { + "snapshot-id": snapshot_id, + "parent-snapshot-id": parent_snapshot_id, + "timestamp-ms": timestamp_ms or int(time.time() * 1000), + "sequence-number": sequence_number, + "summary": {"operation": "append"}, + "manifest-list": f"s3://a/b/{snapshot_id}.avro", + } + + snapshots = [] + snapshot_log = [] + initial_snapshot_id = 3051729675574597004 + + for i in range(2000): + snapshot_id = initial_snapshot_id + i + parent_snapshot_id = snapshot_id - 1 if i > 0 else None + timestamp_ms = int(time.time() * 1000) - randint(0, 1000000) + snapshots.append(generate_snapshot(snapshot_id, parent_snapshot_id, timestamp_ms, i)) + snapshot_log.append({"snapshot-id": snapshot_id, "timestamp-ms": timestamp_ms}) + + return { + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 1, + "schemas": [ + {"type": "struct", "schema-id": 0, "fields": [{"id": 1, "name": "x", "required": True, "type": "long"}]}, + { + "type": "struct", + "schema-id": 1, + "identifier-field-ids": [1, 2], + "fields": [ + {"id": 1, "name": "x", "required": True, "type": "long"}, + {"id": 2, "name": "y", "required": True, "type": "long", "doc": "comment"}, + {"id": 3, "name": "z", "required": True, "type": "long"}, + ], + }, + ], + "default-spec-id": 0, + "partition-specs": [{"spec-id": 0, "fields": [{"name": "x", "transform": "identity", "source-id": 1, "field-id": 1000}]}], + "last-partition-id": 1000, + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + {"transform": "identity", "source-id": 2, "direction": "asc", "null-order": "nulls-first"}, + {"transform": "bucket[4]", "source-id": 3, "direction": "desc", "null-order": "nulls-last"}, + ], + } + ], + "properties": {"read.split.target.size": "134217728"}, + "current-snapshot-id": initial_snapshot_id + 1999, + "snapshots": snapshots, + "snapshot-log": snapshot_log, + "metadata-log": [{"metadata-file": "s3://bucket/.../v1.json", "timestamp-ms": 1515100}], + "refs": {"test": {"snapshot-id": initial_snapshot_id, "type": "tag", "max-ref-age-ms": 10000000}}, + } + + EXAMPLE_TABLE_METADATA_V2 = { "format-version": 2, "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", @@ -1992,6 +2064,18 @@ def table_v2(example_table_metadata_v2: Dict[str, Any]) -> Table: ) +@pytest.fixture +def table_v2_with_extensive_snapshots(example_table_metadata_v2_with_extensive_snapshots: Dict[str, Any]) -> Table: + table_metadata = TableMetadataV2(**example_table_metadata_v2_with_extensive_snapshots) + return Table( + identifier=("database", "table"), + metadata=table_metadata, + metadata_location=f"{table_metadata.location}/uuid.metadata.json", + io=load_file_io(), + catalog=NoopCatalog("NoopCatalog"), + ) + + @pytest.fixture def bound_reference_str() -> BoundReference[str]: return BoundReference(field=NestedField(1, "field", StringType(), required=False), accessor=Accessor(position=0, inner=None)) diff --git a/tests/table/test_init.py b/tests/table/test_init.py index 20b77b6abd..dd1c1ec09f 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -241,6 +241,21 @@ def test_ancestors_of(table_v2: Table) -> None: ] +def test_ancestors_of_recursive_error(table_v2_with_extensive_snapshots: Table) -> None: + # Test RecursionError: maximum recursion depth exceeded + assert ( + len( + list( + ancestors_of( + table_v2_with_extensive_snapshots.current_snapshot(), + table_v2_with_extensive_snapshots.metadata, + ) + ) + ) + == 2000 + ) + + def test_snapshot_by_id_does_not_exist(table_v2: Table) -> None: assert table_v2.snapshot_by_id(-1) is None