Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions src/Analyzer/FunctionSecretArgumentsFinderTreeNode.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,14 @@ class FunctionTreeNodeImpl : public AbstractFunction
{
public:
explicit ArgumentsTreeNode(const QueryTreeNodes * arguments_) : arguments(arguments_) {}
size_t size() const override { return arguments ? arguments->size() : 0; }
std::unique_ptr<Argument> at(size_t n) const override { return std::make_unique<ArgumentTreeNode>(arguments->at(n).get()); }
size_t size() const override
{ /// size withous skipped indexes
return arguments ? arguments->size() - skippedSize() : 0;
}
std::unique_ptr<Argument> at(size_t n) const override
{ /// n is relative index, some can be skipped
return std::make_unique<ArgumentTreeNode>(arguments->at(getRealIndex(n)).get());
}
private:
const QueryTreeNodes * arguments = nullptr;
};
Expand Down
2 changes: 1 addition & 1 deletion src/Databases/Iceberg/DatabaseIceberg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ StoragePtr DatabaseIceberg::tryGetTable(const String & name, ContextPtr context_

/// with_table_structure = false: because there will be
/// no table structure in table definition AST.
StorageObjectStorage::Configuration::initialize(*configuration, args, context_, /* with_table_structure */false, storage_settings.get());
configuration->initialize(args, context_, /* with_table_structure */false, storage_settings.get());

auto cluster_name = settings[DatabaseIcebergSetting::object_storage_cluster].value;

Expand Down
60 changes: 42 additions & 18 deletions src/Disks/DiskType.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace ErrorCodes
extern const int UNKNOWN_ELEMENT_IN_CONFIG;
}

MetadataStorageType metadataTypeFromString(const String & type)
MetadataStorageType metadataTypeFromString(const std::string & type)
{
auto check_type = Poco::toLower(type);
if (check_type == "local")
Expand Down Expand Up @@ -53,23 +53,47 @@ std::string DataSourceDescription::toString() const
case DataSourceType::RAM:
return "memory";
case DataSourceType::ObjectStorage:
{
switch (object_storage_type)
{
case ObjectStorageType::S3:
return "s3";
case ObjectStorageType::HDFS:
return "hdfs";
case ObjectStorageType::Azure:
return "azure_blob_storage";
case ObjectStorageType::Local:
return "local_blob_storage";
case ObjectStorageType::Web:
return "web";
case ObjectStorageType::None:
return "none";
}
}
return DB::toString(object_storage_type);
}
}

ObjectStorageType objectStorageTypeFromString(const std::string & type)
{
auto check_type = Poco::toLower(type);
if (check_type == "s3")
return ObjectStorageType::S3;
if (check_type == "hdfs")
return ObjectStorageType::HDFS;
if (check_type == "azure_blob_storage" || check_type == "azure")
return ObjectStorageType::Azure;
if (check_type == "local_blob_storage" || check_type == "local")
return ObjectStorageType::Local;
if (check_type == "web")
return ObjectStorageType::Web;
if (check_type == "none")
return ObjectStorageType::None;

throw Exception(ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG,
"Unknown object storage type: {}", type);
}

std::string toString(ObjectStorageType type)
{
switch (type)
{
case ObjectStorageType::S3:
return "s3";
case ObjectStorageType::HDFS:
return "hdfs";
case ObjectStorageType::Azure:
return "azure_blob_storage";
case ObjectStorageType::Local:
return "local_blob_storage";
case ObjectStorageType::Web:
return "web";
case ObjectStorageType::None:
return "none";
}
}

}
6 changes: 4 additions & 2 deletions src/Disks/DiskType.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@ enum class MetadataStorageType : uint8_t
Memory,
};

MetadataStorageType metadataTypeFromString(const String & type);
String toString(DataSourceType data_source_type);
MetadataStorageType metadataTypeFromString(const std::string & type);

ObjectStorageType objectStorageTypeFromString(const std::string & type);
std::string toString(ObjectStorageType type);

struct DataSourceDescription
{
Expand Down
108 changes: 98 additions & 10 deletions src/Parsers/FunctionSecretArgumentsFinder.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
#include <Common/KnownObjectNames.h>
#include <Common/re2.h>
#include <Common/maskURIPassword.h>
#include <Common/NamedCollections/NamedCollections.h>
#include <Common/NamedCollections/NamedCollectionsFactory.h>
#include <Core/QualifiedTableName.h>
#include <base/defines.h>
#include <boost/algorithm/string/predicate.hpp>
#include <Poco/String.h>


namespace DB
Expand All @@ -29,6 +32,21 @@ class AbstractFunction
virtual ~Arguments() = default;
virtual size_t size() const = 0;
virtual std::unique_ptr<Argument> at(size_t n) const = 0;
void skipArgument(size_t n) { skipped_indexes.insert(n); }
void unskipArguments() { skipped_indexes.clear(); }
size_t getRealIndex(size_t n) const
{
for (auto idx : skipped_indexes)
{
if (n < idx)
break;
++n;
}
return n;
}
size_t skippedSize() const { return skipped_indexes.size(); }
private:
std::set<size_t> skipped_indexes;
Comment on lines +35 to +49
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is kind of confusing. I get the idea, but not entirely and why it has to be done this way.

Why do you have to manually call getRealIndex in some places knowing you already implemented that in ARgumentsAST:

std::unique_ptr<Argument> at(size_t n) const override
        { /// n is relative index, some can be skipped
            return std::make_unique<ArgumentTreeNode>(arguments->at(getRealIndex(n)).get());
        }

Why unskip?

Copy link
Author

@ianton-ru ianton-ru Apr 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In markSecretArgument and maskAzureConnectionString index goes outside in code, which works with raw index, not through this local Arguments class.
Like here - https://github.com/ClickHouse/ClickHouse/blob/master/src/Analyzer/Resolve/QueryAnalyzer.cpp#L2932

};

virtual ~AbstractFunction() = default;
Expand Down Expand Up @@ -75,14 +93,15 @@ class FunctionSecretArgumentsFinder
{
if (index >= function->arguments->size())
return;
auto real_index = function->arguments->getRealIndex(index);
if (!result.count)
{
result.start = index;
result.start = real_index;
result.are_named = argument_is_named;
}
chassert(index >= result.start); /// We always check arguments consecutively
chassert(real_index >= result.start); /// We always check arguments consecutively
chassert(result.replacement.empty()); /// We shouldn't use replacement with masking other arguments
result.count = index + 1 - result.start;
result.count = real_index + 1 - result.start;
if (!argument_is_named)
result.are_named = false;
}
Expand All @@ -100,14 +119,18 @@ class FunctionSecretArgumentsFinder
{
findMongoDBSecretArguments();
}
else if (function->name() == "iceberg")
{
findIcebergFunctionSecretArguments();
}
else if ((function->name() == "s3") || (function->name() == "cosn") || (function->name() == "oss") ||
(function->name() == "deltaLake") || (function->name() == "hudi") || (function->name() == "iceberg") ||
(function->name() == "deltaLake") || (function->name() == "hudi") ||
(function->name() == "gcs") || (function->name() == "icebergS3"))
{
/// s3('url', 'aws_access_key_id', 'aws_secret_access_key', ...)
findS3FunctionSecretArguments(/* is_cluster_function= */ false);
}
else if (function->name() == "s3Cluster")
else if ((function->name() == "s3Cluster") || (function->name() == "icebergS3Cluster"))
{
/// s3Cluster('cluster_name', 'url', 'aws_access_key_id', 'aws_secret_access_key', ...)
findS3FunctionSecretArguments(/* is_cluster_function= */ true);
Expand All @@ -117,7 +140,7 @@ class FunctionSecretArgumentsFinder
/// azureBlobStorage(connection_string|storage_account_url, container_name, blobpath, account_name, account_key, format, compression, structure)
findAzureBlobStorageFunctionSecretArguments(/* is_cluster_function= */ false);
}
else if (function->name() == "azureBlobStorageCluster")
else if ((function->name() == "azureBlobStorageCluster") || (function->name() == "icebergAzureCluster"))
{
/// azureBlobStorageCluster(cluster, connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])
findAzureBlobStorageFunctionSecretArguments(/* is_cluster_function= */ true);
Expand Down Expand Up @@ -218,11 +241,18 @@ class FunctionSecretArgumentsFinder
findSecretNamedArgument("secret_access_key", 1);
return;
}
if (is_cluster_function && isNamedCollectionName(1))
{
/// s3Cluster(cluster, named_collection, ..., secret_access_key = 'secret_access_key', ...)
findSecretNamedArgument("secret_access_key", 2);
return;
}

/// We should check other arguments first because we don't need to do any replacement in case of
/// s3('url', NOSIGN, 'format' [, 'compression'] [, extra_credentials(..)] [, headers(..)])
/// s3('url', 'format', 'structure' [, 'compression'] [, extra_credentials(..)] [, headers(..)])
size_t count = excludeS3OrURLNestedMaps();

if ((url_arg_idx + 3 <= count) && (count <= url_arg_idx + 4))
{
String second_arg;
Expand Down Expand Up @@ -287,6 +317,48 @@ class FunctionSecretArgumentsFinder
markSecretArgument(url_arg_idx + 4);
}

std::string findIcebergStorageType()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you have findIcebergStorageType and extractDynamicStorageType?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Couldn't you implement the same remove argument approach instead of doing the "skipIndices" thing?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

findIcebergStorageType is used in logic to mask secrets, different storage types has secrets on different position, so need to detect type to choose proper method (findS3TableEngineSecretArguments or findAzureTableEngineSecretArguments). I use "skip" thing because can't remove argument - this code is used to writing in logs, and add arguments must be there.
In my opinion here hard to reuse code, and other old methods like findS3TableEngineSecretArguments don't reuse code, have a logic duplicate instead. Code reusing makes code more complex here.

{
std::string storage_type = "s3";

size_t count = function->arguments->size();
if (!count)
return storage_type;

auto storage_type_idx = findNamedArgument(&storage_type, "storage_type");
if (storage_type_idx != -1)
{
storage_type = Poco::toLower(storage_type);
function->arguments->skipArgument(storage_type_idx);
}
else if (isNamedCollectionName(0))
{
std::string collection_name;
if (function->arguments->at(0)->tryGetString(&collection_name, true))
{
NamedCollectionPtr collection = NamedCollectionFactory::instance().tryGet(collection_name);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In which case would NamedCollectionFactory::instance().tryGet fail to get a named collection? I am asking because I assumed isNamedCollectionName(0) was already making sure the named collection existed.

If there is a chance this will fail, shouldn't you throw?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isNamedCollectionName actually only check that this argument is an identifier. This code is only for masking passwords and other sensitive info in logs etc., and getting storage_type from named collection here is only because S3 and Azure have passwords in different positions. I believe that if this code can't detect type correctly it should not break query execution more than it was broken before, because is used for example when ClickHouse writes query when it already catch another exception.

if (collection && collection->has("storage_type"))
{
storage_type = Poco::toLower(collection->get<std::string>("storage_type"));
}
}
}

return storage_type;
}

void findIcebergFunctionSecretArguments()
{
auto storage_type = findIcebergStorageType();

if (storage_type == "s3")
findS3FunctionSecretArguments(false);
else if (storage_type == "azure")
findAzureBlobStorageFunctionSecretArguments(false);

function->arguments->unskipArguments();
}

bool maskAzureConnectionString(ssize_t url_arg_idx, bool argument_is_named = false, size_t start = 0)
{
String url_arg;
Expand All @@ -310,7 +382,7 @@ class FunctionSecretArgumentsFinder
if (RE2::Replace(&url_arg, account_key_pattern, "AccountKey=[HIDDEN]\\1"))
{
chassert(result.count == 0); /// We shouldn't use replacement with masking other arguments
result.start = url_arg_idx;
result.start = function->arguments->getRealIndex(url_arg_idx);
result.are_named = argument_is_named;
result.count = 1;
result.replacement = url_arg;
Expand Down Expand Up @@ -458,6 +530,7 @@ class FunctionSecretArgumentsFinder
void findTableEngineSecretArguments()
{
const String & engine_name = function->name();

if (engine_name == "ExternalDistributed")
{
/// ExternalDistributed('engine', 'host:port', 'database', 'table', 'user', 'password')
Expand All @@ -475,10 +548,13 @@ class FunctionSecretArgumentsFinder
{
findMongoDBSecretArguments();
}
else if (engine_name == "Iceberg")
{
findIcebergTableEngineSecretArguments();
}
else if ((engine_name == "S3") || (engine_name == "COSN") || (engine_name == "OSS")
|| (engine_name == "DeltaLake") || (engine_name == "Hudi")
|| (engine_name == "Iceberg") || (engine_name == "IcebergS3")
|| (engine_name == "S3Queue"))
|| (engine_name == "IcebergS3") || (engine_name == "S3Queue"))
{
/// S3('url', ['aws_access_key_id', 'aws_secret_access_key',] ...)
findS3TableEngineSecretArguments();
Expand All @@ -487,7 +563,7 @@ class FunctionSecretArgumentsFinder
{
findURLSecretArguments();
}
else if (engine_name == "AzureBlobStorage")
else if ((engine_name == "AzureBlobStorage") || (engine_name == "IcebergAzure"))
{
findAzureBlobStorageTableEngineSecretArguments();
}
Expand Down Expand Up @@ -579,6 +655,18 @@ class FunctionSecretArgumentsFinder
markSecretArgument(url_arg_idx + 4);
}

void findIcebergTableEngineSecretArguments()
{
auto storage_type = findIcebergStorageType();

if (storage_type == "s3")
findS3TableEngineSecretArguments();
else if (storage_type == "azure")
findAzureBlobStorageTableEngineSecretArguments();

function->arguments->unskipArguments();
}

void findDatabaseEngineSecretArguments()
{
const String & engine_name = function->name();
Expand Down
9 changes: 6 additions & 3 deletions src/Parsers/FunctionSecretArgumentsFinderAST.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,13 @@ class FunctionAST : public AbstractFunction
{
public:
explicit ArgumentsAST(const ASTs * arguments_) : arguments(arguments_) {}
size_t size() const override { return arguments ? arguments->size() : 0; }
size_t size() const override
{ /// size withous skipped indexes
return arguments ? arguments->size() - skippedSize() : 0;
}
std::unique_ptr<Argument> at(size_t n) const override
{
return std::make_unique<ArgumentAST>(arguments->at(n).get());
{ /// n is relative index, some can be skipped
return std::make_unique<ArgumentAST>(arguments->at(getRealIndex(n)).get());
}
private:
const ASTs * arguments = nullptr;
Expand Down
1 change: 1 addition & 0 deletions src/Storages/ObjectStorage/Azure/Configuration.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ const std::unordered_set<std::string_view> optional_configuration_keys = {
"account_key",
"connection_string",
"storage_account_url",
"storage_type",
};

void StorageAzureConfiguration::check(ContextPtr context) const
Expand Down
Loading
Loading