15
15
#include < Storages/ObjectStorage/DataLakes/DataLakeStorageSettings.h>
16
16
#include " Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadataFilesCache.h"
17
17
#include < Interpreters/ExpressionActions.h>
18
+ #include < IO/CompressedReadBufferWrapper.h>
18
19
19
20
#include < Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h>
20
21
#include < Storages/ObjectStorage/DataLakes/Iceberg/Utils.h>
@@ -104,7 +105,8 @@ Poco::JSON::Object::Ptr getMetadataJSONObject(
104
105
StorageObjectStorage::ConfigurationPtr configuration_ptr,
105
106
IcebergMetadataFilesCachePtr cache_ptr,
106
107
const ContextPtr & local_context,
107
- LoggerPtr log)
108
+ LoggerPtr log,
109
+ CompressionMethod compression_method)
108
110
{
109
111
auto create_fn = [&]()
110
112
{
@@ -115,7 +117,14 @@ Poco::JSON::Object::Ptr getMetadataJSONObject(
115
117
if (cache_ptr)
116
118
read_settings.enable_filesystem_cache = false ;
117
119
118
- auto buf = StorageObjectStorageSource::createReadBuffer (object_info, object_storage, local_context, log, read_settings);
120
+ auto source_buf = StorageObjectStorageSource::createReadBuffer (object_info, object_storage, local_context, log);
121
+
122
+ std::unique_ptr<ReadBuffer> buf;
123
+ if (compression_method != CompressionMethod::None)
124
+ buf = wrapReadBufferWithCompressionMethod (std::move (source_buf), compression_method);
125
+ else
126
+ buf = std::move (source_buf);
127
+
119
128
String json_str;
120
129
readJSONObjectPossiblyInvalid (json_str, *buf);
121
130
return json_str;
@@ -274,7 +283,30 @@ Int32 IcebergMetadata::parseTableSchema(
274
283
}
275
284
}
276
285
277
- static std::pair<Int32, String> getMetadataFileAndVersion (const std::string & path)
286
+ struct MetadataFileWithInfo
287
+ {
288
+ Int32 version;
289
+ String path;
290
+ CompressionMethod compression_method;
291
+ };
292
+
293
+ static CompressionMethod getCompressionMethodFromMetadataFile (const String & path)
294
+ {
295
+ constexpr std::string_view metadata_suffix = " .metadata.json" ;
296
+
297
+ auto compression_method = chooseCompressionMethod (path, " auto" );
298
+
299
+ // / NOTE you will be surprised, but some metadata files store compression not in the end of the file name,
300
+ // / but somewhere in the middle of the file name, before metadata.json suffix.
301
+ // / Maybe history of Iceberg metadata files is not so long, but it is already full of surprises.
302
+ // / Example of weird engineering decisions: 00000-85befd5a-69c7-46d4-bca6-cfbd67f0f7e6.gz.metadata.json
303
+ if (compression_method == CompressionMethod::None && path.ends_with (metadata_suffix))
304
+ compression_method = chooseCompressionMethod (path.substr (0 , path.size () - metadata_suffix.size ()), " auto" );
305
+
306
+ return compression_method;
307
+ }
308
+
309
+ static MetadataFileWithInfo getMetadataFileAndVersion (const std::string & path)
278
310
{
279
311
String file_name (path.begin () + path.find_last_of (' /' ) + 1 , path.end ());
280
312
String version_str;
@@ -289,7 +321,10 @@ static std::pair<Int32, String> getMetadataFileAndVersion(const std::string & pa
289
321
throw Exception (
290
322
ErrorCodes::BAD_ARGUMENTS, " Bad metadata file name: {}. Expected vN.metadata.json where N is a number" , file_name);
291
323
292
- return std::make_pair (std::stoi (version_str), path);
324
+ return MetadataFileWithInfo{
325
+ .version = std::stoi (version_str),
326
+ .path = path,
327
+ .compression_method = getCompressionMethodFromMetadataFile (path)};
293
328
}
294
329
295
330
enum class MostRecentMetadataFileSelectionWay
@@ -300,7 +335,7 @@ enum class MostRecentMetadataFileSelectionWay
300
335
301
336
struct ShortMetadataFileInfo
302
337
{
303
- UInt32 version;
338
+ Int32 version;
304
339
UInt64 last_updated_ms;
305
340
String path;
306
341
};
@@ -312,7 +347,7 @@ struct ShortMetadataFileInfo
312
347
* 1) v<V>.metadata.json, where V - metadata version.
313
348
* 2) <V>-<random-uuid>.metadata.json, where V - metadata version
314
349
*/
315
- static std::pair<Int32, String> getLatestMetadataFileAndVersion (
350
+ static MetadataFileWithInfo getLatestMetadataFileAndVersion (
316
351
const ObjectStoragePtr & object_storage,
317
352
StorageObjectStorage::ConfigurationPtr configuration_ptr,
318
353
IcebergMetadataFilesCachePtr cache_ptr,
@@ -336,10 +371,10 @@ static std::pair<Int32, String> getLatestMetadataFileAndVersion(
336
371
metadata_files_with_versions.reserve (metadata_files.size ());
337
372
for (const auto & path : metadata_files)
338
373
{
339
- auto [version, metadata_file_path] = getMetadataFileAndVersion (path);
374
+ auto [version, metadata_file_path, compression_method ] = getMetadataFileAndVersion (path);
340
375
if (need_all_metadata_files_parsing)
341
376
{
342
- auto metadata_file_object = getMetadataJSONObject (metadata_file_path, object_storage, configuration_ptr, cache_ptr, local_context, log);
377
+ auto metadata_file_object = getMetadataJSONObject (metadata_file_path, object_storage, configuration_ptr, cache_ptr, local_context, log, compression_method );
343
378
if (table_uuid.has_value ())
344
379
{
345
380
if (metadata_file_object->has (f_table_uuid))
@@ -389,10 +424,11 @@ static std::pair<Int32, String> getLatestMetadataFileAndVersion(
389
424
[](const ShortMetadataFileInfo & a, const ShortMetadataFileInfo & b) { return a.version < b.version ; });
390
425
}
391
426
}();
392
- return {latest_metadata_file_info.version , latest_metadata_file_info.path };
427
+
428
+ return {latest_metadata_file_info.version , latest_metadata_file_info.path , getCompressionMethodFromMetadataFile (latest_metadata_file_info.path )};
393
429
}
394
430
395
- static std::pair<Int32, String> getLatestOrExplicitMetadataFileAndVersion (
431
+ static MetadataFileWithInfo getLatestOrExplicitMetadataFileAndVersion (
396
432
const ObjectStoragePtr & object_storage,
397
433
StorageObjectStorage::ConfigurationPtr configuration_ptr,
398
434
IcebergMetadataFilesCachePtr cache_ptr,
@@ -459,7 +495,7 @@ bool IcebergMetadata::update(const ContextPtr & local_context)
459
495
460
496
std::lock_guard lock (mutex);
461
497
462
- const auto [metadata_version, metadata_file_path]
498
+ const auto [metadata_version, metadata_file_path, compression_method ]
463
499
= getLatestOrExplicitMetadataFileAndVersion (object_storage, configuration_ptr, manifest_cache, local_context, log.get ());
464
500
465
501
bool metadata_file_changed = false ;
@@ -469,7 +505,7 @@ bool IcebergMetadata::update(const ContextPtr & local_context)
469
505
metadata_file_changed = true ;
470
506
}
471
507
472
- auto metadata_object = getMetadataJSONObject (metadata_file_path, object_storage, configuration_ptr, manifest_cache, local_context, log);
508
+ auto metadata_object = getMetadataJSONObject (metadata_file_path, object_storage, configuration_ptr, manifest_cache, local_context, log, compression_method );
473
509
chassert (format_version == metadata_object->getValue <int >(f_format_version));
474
510
475
511
auto previous_snapshot_id = relevant_snapshot_id;
@@ -662,9 +698,9 @@ DataLakeMetadataPtr IcebergMetadata::create(
662
698
else
663
699
LOG_TRACE (log, " Not using in-memory cache for iceberg metadata files, because the setting use_iceberg_metadata_files_cache is false." );
664
700
665
- const auto [metadata_version, metadata_file_path] = getLatestOrExplicitMetadataFileAndVersion (object_storage, configuration_ptr, cache_ptr, local_context, log.get ());
701
+ const auto [metadata_version, metadata_file_path, compression_method ] = getLatestOrExplicitMetadataFileAndVersion (object_storage, configuration_ptr, cache_ptr, local_context, log.get ());
666
702
667
- Poco::JSON::Object::Ptr object = getMetadataJSONObject (metadata_file_path, object_storage, configuration_ptr, cache_ptr, local_context, log);
703
+ Poco::JSON::Object::Ptr object = getMetadataJSONObject (metadata_file_path, object_storage, configuration_ptr, cache_ptr, local_context, log, compression_method );
668
704
669
705
auto format_version = object->getValue <int >(f_format_version);
670
706
return std::make_unique<IcebergMetadata>(object_storage, configuration_ptr, local_context, metadata_version, format_version, object, cache_ptr);
@@ -734,15 +770,15 @@ IcebergMetadata::IcebergHistory IcebergMetadata::getHistory(ContextPtr local_con
734
770
{
735
771
auto configuration_ptr = configuration.lock ();
736
772
737
- const auto [metadata_version, metadata_file_path] = getLatestOrExplicitMetadataFileAndVersion (object_storage, configuration_ptr, manifest_cache, local_context, log.get ());
773
+ const auto [metadata_version, metadata_file_path, compression_method ] = getLatestOrExplicitMetadataFileAndVersion (object_storage, configuration_ptr, manifest_cache, local_context, log.get ());
738
774
739
775
chassert ([&]()
740
776
{
741
777
SharedLockGuard lock (mutex);
742
778
return metadata_version == last_metadata_version;
743
779
}());
744
780
745
- auto metadata_object = getMetadataJSONObject (metadata_file_path, object_storage, configuration_ptr, manifest_cache, local_context, log);
781
+ auto metadata_object = getMetadataJSONObject (metadata_file_path, object_storage, configuration_ptr, manifest_cache, local_context, log, compression_method );
746
782
chassert ([&]()
747
783
{
748
784
SharedLockGuard lock (mutex);
0 commit comments