@@ -108,8 +108,7 @@ def normalize(self) -> ManifestType:
108
108
ManifestNormalizationException: Caught internally and handled by returning the original manifest.
109
109
"""
110
110
try :
111
- self ._deduplicate_minifest ()
112
- self ._reference_schemas ()
111
+ self ._deduplicate_manifest ()
113
112
114
113
return self ._normalized_manifest
115
114
except ManifestNormalizationException :
@@ -131,7 +130,7 @@ def _get_manifest_streams(self) -> Iterable[Dict[str, Any]]:
131
130
132
131
yield from []
133
132
134
- def _deduplicate_minifest (self ) -> None :
133
+ def _deduplicate_manifest (self ) -> None :
135
134
"""
136
135
Find commonalities in the input JSON structure and refactor it to avoid redundancy.
137
136
"""
@@ -141,9 +140,117 @@ def _deduplicate_minifest(self) -> None:
141
140
self ._prepare_definitions ()
142
141
# replace duplicates with references, if any
143
142
self ._handle_duplicates (self ._collect_duplicates ())
143
+ # replace parent streams with $refs
144
+ self ._replace_parent_streams_with_refs ()
145
+ # clean dangling fields after resolving $refs
146
+ self ._clean_dangling_fields ()
144
147
except Exception as e :
145
148
raise ManifestNormalizationException (str (e ))
146
149
150
def _replace_parent_streams_with_refs(self) -> None:
    """
    Replace inlined parent streams with `$ref` pointers to top-level streams.

    For every stream listed under `STREAMS_TAG`, each `SubstreamPartitionRouter`
    found in `retriever.partition_router` (a single router or a list of routers)
    is inspected. Whenever a `parent_stream_configs[*].stream` dict hashes (via
    `self._hash_object`) to the same value as one of the top-level streams, it is
    replaced in place with `{"$ref": "#/streams/<index>"}`.

    Malformed entries (non-dict streams, retrievers, routers or parent configs,
    or a missing / null `parent_stream_configs`) are skipped rather than raising,
    so one odd entry does not abort normalization of the whole manifest.
    """
    streams = self._normalized_manifest.get(STREAMS_TAG) or []

    # Hash every top-level stream once so each parent lookup is a dict access.
    # When two streams hash identically, the later index wins.
    stream_hash_to_index = {
        self._hash_object(stream): idx for idx, stream in enumerate(streams)
    }

    # Resolve every match BEFORE mutating anything. If an inlined parent happens
    # to be the very same object as a top-level stream (aliasing -- not confirmed
    # for this manifest, but cheap to guard against), rewriting that stream's own
    # parents first would change its hash and the match would be missed.
    replacements = []
    for stream in streams:
        if not isinstance(stream, dict):
            continue
        retriever = stream.get("retriever")
        if not isinstance(retriever, dict):
            continue
        partition_router = retriever.get("partition_router")
        if not partition_router:
            continue
        # A partition router may be declared as a single object or as a list.
        routers = partition_router if isinstance(partition_router, list) else [partition_router]
        for router in routers:
            if not isinstance(router, dict):
                continue
            if router.get("type") != "SubstreamPartitionRouter":
                continue
            for parent_config in router.get("parent_stream_configs") or []:
                if not isinstance(parent_config, dict):
                    continue
                parent_stream = parent_config.get("stream")
                # Only inlined (dict) parents are candidates for replacement.
                if not isinstance(parent_stream, dict):
                    continue
                index = stream_hash_to_index.get(self._hash_object(parent_stream))
                if index is not None:
                    replacements.append((parent_config, index))

    for parent_config, index in replacements:
        parent_config["stream"] = {"$ref": f"#/streams/{index}"}
193
+
194
def _clean_dangling_fields(self) -> None:
    """
    Remove top-level definitions and schemas that no `$ref` points to.

    Every string `$ref` value in the manifest is collected. Each direct child of
    the `DEF_TAG` and `SCHEMAS_TAG` sections that is neither a reference target
    nor an ancestor of one is deleted. A section that ends up empty is removed
    from the manifest entirely.

    NOTE: references located inside the definitions / schemas sections themselves
    are counted too, so an entry referenced only by another unused entry is kept.
    """

    def find_all_refs(obj: Any, refs: List[str]) -> None:
        """
        Recursively collect all `$ref` paths found in `obj`.

        Args:
            obj: The object to search through (dicts and lists are traversed,
                anything else is ignored)
            refs: List to store found reference paths, without the leading `#/`
        """
        if isinstance(obj, dict):
            for key, value in obj.items():
                if key == "$ref" and isinstance(value, str):
                    # Strip the leading "#/" only when it is actually present.
                    refs.append(value[2:] if value.startswith("#/") else value)
                else:
                    find_all_refs(value, refs)
        elif isinstance(obj, list):
            # Also covers lists nested directly inside other lists.
            for item in obj:
                find_all_refs(item, refs)

    def clean_section(section: Dict[str, Any], section_path: str) -> None:
        """
        Delete the direct children of `section` that are not referenced.

        Args:
            section: The section to clean (mutated in place)
            section_path: The path to this section in the manifest
        """
        for key in list(section):
            current_path = f"{section_path}/{key}"
            child_prefix = f"{current_path}/"
            # Match on a path-segment boundary: a plain prefix test would let a
            # reference to ".../foobar" keep an unused ".../foo" alive.
            is_referenced = any(
                ref == current_path or ref.startswith(child_prefix) for ref in all_refs
            )
            if not is_referenced:
                del section[key]

    # Find all references in the manifest
    all_refs: List[str] = []
    find_all_refs(self._normalized_manifest, all_refs)

    for tag in (DEF_TAG, SCHEMAS_TAG):
        if tag not in self._normalized_manifest:
            continue
        section = self._normalized_manifest[tag]
        # Only dict sections can be cleaned key by key; other shapes are left alone.
        if isinstance(section, dict):
            clean_section(section, tag)
        # Remove the section altogether once nothing is left in it.
        if not self._normalized_manifest[tag]:
            del self._normalized_manifest[tag]
253
+
147
254
def _prepare_definitions (self ) -> None :
148
255
"""
149
256
Clean the definitions in the manifest by removing unnecessary properties.
@@ -163,43 +270,6 @@ def _prepare_definitions(self) -> None:
163
270
if key != LINKED_TAG :
164
271
self ._normalized_manifest [DEF_TAG ].pop (key , None )
165
272
166
def _extract_stream_schema(self, stream: Dict[str, Any]) -> None:
    """
    Register the stream's inline schema under the manifest's `schemas` section.

    The value found at `stream[SCHEMA_LOADER_TAG][SCHEMA_TAG]` (or `None` when
    absent) is stored under the stream's name. The `schemas` section is created
    when missing, and an entry that already exists for that name is left untouched.
    """
    name = stream["name"]
    loader = stream.get(SCHEMA_LOADER_TAG, {})
    inline_schema = loader.get(SCHEMA_TAG)

    # setdefault creates the section / entry only when it is not there yet
    schemas = self._normalized_manifest.setdefault(SCHEMAS_TAG, {})
    schemas.setdefault(name, inline_schema)
180
-
181
def _reference_schemas(self) -> None:
    """
    Extract every stream's schema and point the stream at the extracted copy.

    Nothing happens unless the manifest already contains a `schemas` section.
    Streams are modified in place.
    """
    if SCHEMAS_TAG not in self._normalized_manifest:
        return

    for manifest_stream in self._get_manifest_streams():
        # store the schema first, then swap the inline value for a reference to it
        self._extract_stream_schema(manifest_stream)
        self._set_stream_schema_ref(manifest_stream)
192
-
193
def _set_stream_schema_ref(self, stream: Dict[str, Any]) -> None:
    """
    Replace the stream's inline schema with a reference into the `schemas` section.

    The stream is modified in place, and only when the manifest has a `schemas`
    section containing an entry named after the stream.
    """
    name = stream["name"]
    manifest = self._normalized_manifest

    if SCHEMAS_TAG not in manifest:
        return
    if name not in manifest[SCHEMAS_TAG]:
        return

    schema_ref = self._create_schema_ref(name)
    stream[SCHEMA_LOADER_TAG][SCHEMA_TAG] = schema_ref
202
-
203
273
def _replace_duplicates_with_refs (self , duplicates : DuplicatesType ) -> None :
204
274
"""
205
275
Process duplicate objects and replace them with references.
@@ -447,16 +517,3 @@ def _create_linked_definition_ref(self, type_key: str, key: str) -> Dict[str, st
447
517
"""
448
518
449
519
return {"$ref" : f"#/{ DEF_TAG } /{ LINKED_TAG } /{ type_key } /{ key } " }
450
-
451
def _create_schema_ref(self, key: str) -> Dict[str, str]:
    """
    Build the `$ref` object that points at a stream schema.

    Args:
        key: Name of the entry inside the `schemas` section

    Returns:
        A single-entry dict mapping "$ref" to the schema's path in the manifest
    """
    schema_path = f"#/{SCHEMAS_TAG}/{key}"
    return {"$ref": schema_path}
0 commit comments