Skip to content

Commit 66db70e

Browse files
feat: Add thread-safe caching to InferredSchemaLoader
- Add internal memoization with threading.Lock to prevent duplicate schema inference
- Cache schema after first call to avoid re-reading records on subsequent calls
- This addresses the issue where get_json_schema() is called during read operations (in DeclarativePartition.read()), not just during discover
- Add unit test to verify caching behavior (schema inference happens only once)

Fixes performance issue identified by @maxi297 where InferredSchemaLoader would read up to record_sample_size records for every partition/slice during a sync.

Co-Authored-By: AJ Steers <[email protected]>
1 parent e118a20 commit 66db70e

File tree

2 files changed: +65 −24 lines changed

2 files changed

+65
-24
lines changed

airbyte_cdk/sources/declarative/schema/inferred_schema_loader.py

Lines changed: 35 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
33
#
44

5+
import threading
56
from collections.abc import Mapping as ABCMapping
67
from collections.abc import Sequence
78
from dataclasses import InitVar, dataclass
@@ -69,44 +70,54 @@ def __post_init__(self, parameters: Mapping[str, Any]) -> None:
6970
raise ValueError(
7071
"stream_name must be provided either directly or via the 'name' parameter"
7172
)
73+
self._cached_schema: Mapping[str, Any] | None = None
74+
self._lock = threading.Lock()
7275

7376
def get_json_schema(self) -> Mapping[str, Any]:
7477
"""
7578
Infers and returns a JSON schema by reading a sample of records from the stream.
7679
7780
This method reads up to `record_sample_size` records from the stream and uses
78-
the SchemaInferrer to generate a JSON schema. If no records are available,
79-
it returns an empty schema.
81+
the SchemaInferrer to generate a JSON schema. The schema is cached after the first
82+
call to avoid re-reading records on subsequent calls (e.g., during partition reads).
8083
8184
Returns:
8285
A mapping representing the inferred JSON schema for the stream
8386
"""
84-
schema_inferrer = SchemaInferrer()
87+
if self._cached_schema is not None:
88+
return self._cached_schema
8589

86-
record_count = 0
87-
for stream_slice in self.retriever.stream_slices():
88-
for record in self.retriever.read_records(records_schema={}, stream_slice=stream_slice):
89-
if record_count >= self.record_sample_size:
90-
break
90+
with self._lock:
91+
if self._cached_schema is not None:
92+
return self._cached_schema
9193

92-
# Convert all Mapping-like and Sequence-like objects to plain Python types
93-
# This is necessary because genson doesn't handle custom implementations properly
94-
record = _to_builtin_types(record)
94+
schema_inferrer = SchemaInferrer()
9595

96-
airbyte_record = AirbyteRecordMessage(
97-
stream=self.stream_name,
98-
data=record, # type: ignore[arg-type]
99-
emitted_at=0,
100-
)
96+
record_count = 0
97+
for stream_slice in self.retriever.stream_slices():
98+
for record in self.retriever.read_records(records_schema={}, stream_slice=stream_slice):
99+
if record_count >= self.record_sample_size:
100+
break
101101

102-
schema_inferrer.accumulate(airbyte_record)
103-
record_count += 1
102+
# Convert all Mapping-like and Sequence-like objects to plain Python types
103+
# This is necessary because genson doesn't handle custom implementations properly
104+
record = _to_builtin_types(record)
104105

105-
if record_count >= self.record_sample_size:
106-
break
106+
airbyte_record = AirbyteRecordMessage(
107+
stream=self.stream_name,
108+
data=record, # type: ignore[arg-type]
109+
emitted_at=0,
110+
)
107111

108-
inferred_schema: Mapping[str, Any] | None = schema_inferrer.get_stream_schema(
109-
self.stream_name
110-
)
112+
schema_inferrer.accumulate(airbyte_record)
113+
record_count += 1
114+
115+
if record_count >= self.record_sample_size:
116+
break
117+
118+
inferred_schema: Mapping[str, Any] | None = schema_inferrer.get_stream_schema(
119+
self.stream_name
120+
)
111121

112-
return inferred_schema if inferred_schema else {}
122+
self._cached_schema = inferred_schema if inferred_schema else {}
123+
return self._cached_schema

unit_tests/sources/declarative/schema/test_inferred_schema_loader.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,3 +184,33 @@ def test_inferred_schema_loader_with_arrays():
184184
assert "properties" in schema
185185
assert "tags" in schema["properties"]
186186
assert "array" in schema["properties"]["tags"]["type"]
187+
188+
189+
def test_inferred_schema_loader_caches_schema():
190+
"""Test that InferredSchemaLoader caches the schema and doesn't re-read records on subsequent calls."""
191+
retriever = MagicMock()
192+
retriever.stream_slices.return_value = iter([None])
193+
retriever.read_records.return_value = iter(
194+
[
195+
{"id": 1, "name": "Alice"},
196+
{"id": 2, "name": "Bob"},
197+
]
198+
)
199+
200+
config = MagicMock()
201+
parameters = {"name": "users"}
202+
loader = InferredSchemaLoader(
203+
retriever=retriever,
204+
config=config,
205+
parameters=parameters,
206+
record_sample_size=2,
207+
stream_name="users",
208+
)
209+
210+
schema1 = loader.get_json_schema()
211+
schema2 = loader.get_json_schema()
212+
schema3 = loader.get_json_schema()
213+
214+
assert schema1 == schema2 == schema3
215+
assert retriever.stream_slices.call_count == 1
216+
assert retriever.read_records.call_count == 1

0 commit comments

Comments (0)