Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/QueryPipeline/RemoteQueryExecutor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -723,8 +723,12 @@ void RemoteQueryExecutor::processReadTaskRequest()
if (!extension || !extension->task_iterator)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Distributed task iterator is not initialized");

if (!extension->replica_info)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Replica info is not initialized");

ProfileEvents::increment(ProfileEvents::ReadTaskRequestsReceived);
auto response = (*extension->task_iterator)();

auto response = (*extension->task_iterator)(extension->replica_info->number_of_current_replica);
connections->sendReadTaskResponse(response);
}

Expand Down
2 changes: 1 addition & 1 deletion src/QueryPipeline/RemoteQueryExecutor.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class RemoteQueryExecutorReadContext;
class ParallelReplicasReadingCoordinator;

/// This is the same type as StorageS3Source::IteratorWrapper
using TaskIterator = std::function<String()>;
using TaskIterator = std::function<String(size_t)>;

/// This class allows one to launch queries on remote replicas of one shard and get results
class RemoteQueryExecutor
Expand Down
73 changes: 47 additions & 26 deletions src/Storages/IStorageCluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ namespace Setting
extern const SettingsBool async_query_sending_for_remote;
extern const SettingsBool async_socket_for_remote;
extern const SettingsBool skip_unavailable_shards;
extern const SettingsNonZeroUInt64 max_parallel_replicas;
}

namespace ErrorCodes
Expand All @@ -59,15 +60,19 @@ void ReadFromCluster::applyFilters(ActionDAGNodes added_filter_nodes)
if (filter_actions_dag)
predicate = filter_actions_dag->getOutputs().at(0);

createExtension(predicate);
auto max_replicas_to_use = static_cast<UInt64>(cluster->getShardsInfo().size());
if (context->getSettingsRef()[Setting::max_parallel_replicas] > 1)
max_replicas_to_use = std::min(max_replicas_to_use, context->getSettingsRef()[Setting::max_parallel_replicas].value);

createExtension(predicate, max_replicas_to_use);
}

void ReadFromCluster::createExtension(const ActionsDAG::Node * predicate)
void ReadFromCluster::createExtension(const ActionsDAG::Node * predicate, size_t number_of_replicas)
{
if (extension)
return;

extension = storage->getTaskIteratorExtension(predicate, context);
extension = storage->getTaskIteratorExtension(predicate, context, number_of_replicas);
}

/// The code executes on initiator
Expand Down Expand Up @@ -155,38 +160,54 @@ SinkToStoragePtr IStorageCluster::write(

void ReadFromCluster::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &)
{
createExtension(nullptr);

const Scalars & scalars = context->hasQueryContext() ? context->getQueryContext()->getScalars() : Scalars{};
const bool add_agg_info = processed_stage == QueryProcessingStage::WithMergeableState;

Pipes pipes;
auto new_context = updateSettings(context->getSettingsRef());
const auto & current_settings = new_context->getSettingsRef();
auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(current_settings);

size_t replica_index = 0;
auto max_replicas_to_use = static_cast<UInt64>(cluster->getShardsInfo().size());
if (current_settings[Setting::max_parallel_replicas] > 1)
max_replicas_to_use = std::min(max_replicas_to_use, current_settings[Setting::max_parallel_replicas].value);

createExtension(nullptr, max_replicas_to_use);

for (const auto & shard_info : cluster->getShardsInfo())
{
auto try_results = shard_info.pool->getMany(timeouts, current_settings, PoolMode::GET_MANY);
for (auto & try_result : try_results)
{
auto remote_query_executor = std::make_shared<RemoteQueryExecutor>(
std::vector<IConnectionPool::Entry>{try_result},
queryToString(query_to_send),
getOutputHeader(),
new_context,
/*throttler=*/nullptr,
scalars,
Tables(),
processed_stage,
extension);

remote_query_executor->setLogger(log);
pipes.emplace_back(std::make_shared<RemoteSource>(
remote_query_executor,
add_agg_info,
current_settings[Setting::async_socket_for_remote],
current_settings[Setting::async_query_sending_for_remote]));
}
/// We're taking all replicas as shards,
/// so each shard will have only one address to connect to.
auto try_results = shard_info.pool->getMany(
timeouts,
current_settings,
PoolMode::GET_ONE,
{},
/*skip_unavailable_endpoints=*/true);

if (try_results.empty())
continue;

IConnections::ReplicaInfo replica_info{ .number_of_current_replica = replica_index++ };

auto remote_query_executor = std::make_shared<RemoteQueryExecutor>(
std::vector<IConnectionPool::Entry>{try_results.front()},
queryToString(query_to_send),
getOutputHeader(),
new_context,
/*throttler=*/nullptr,
scalars,
Tables(),
processed_stage,
RemoteQueryExecutor::Extension{.task_iterator = extension->task_iterator, .replica_info = std::move(replica_info)});

remote_query_executor->setLogger(log);
pipes.emplace_back(std::make_shared<RemoteSource>(
remote_query_executor,
add_agg_info,
current_settings[Setting::async_socket_for_remote],
current_settings[Setting::async_query_sending_for_remote]));
}

auto pipe = Pipe::unitePipes(std::move(pipes));
Expand Down
4 changes: 2 additions & 2 deletions src/Storages/IStorageCluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class IStorageCluster : public IStorage

ClusterPtr getCluster(ContextPtr context) const { return getClusterImpl(context, cluster_name); }
/// Query is needed for pruning by virtual columns (_file, _path)
virtual RemoteQueryExecutor::Extension getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context) const = 0;
virtual RemoteQueryExecutor::Extension getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context, size_t number_of_replicas) const = 0;

QueryProcessingStage::Enum getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override;

Expand Down Expand Up @@ -127,7 +127,7 @@ class ReadFromCluster : public SourceStepWithFilter

std::optional<RemoteQueryExecutor::Extension> extension;

void createExtension(const ActionsDAG::Node * predicate);
void createExtension(const ActionsDAG::Node * predicate, const size_t number_of_replicas);
ContextPtr updateSettings(const Settings & settings);
};

Expand Down
18 changes: 10 additions & 8 deletions src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
#include <Storages/ObjectStorage/Utils.h>
#include <Storages/ObjectStorage/StorageObjectStorageSource.h>
#include <Storages/extractTableFunctionArgumentsFromSelectQuery.h>
#include <Storages/ObjectStorage/StorageObjectStorageStableTaskDistributor.h>


namespace DB
{
Expand Down Expand Up @@ -279,19 +281,19 @@ void StorageObjectStorageCluster::updateQueryToSendIfNeeded(
}

RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExtension(
const ActionsDAG::Node * predicate, const ContextPtr & local_context) const
const ActionsDAG::Node * predicate, const ContextPtr & local_context, const size_t number_of_replicas) const
{
auto iterator = StorageObjectStorageSource::createFileIterator(
configuration, configuration->getQuerySettings(local_context), object_storage, /* distributed_processing */false,
local_context, predicate, getVirtualsList(), nullptr, local_context->getFileProgressCallback());

auto callback = std::make_shared<std::function<String()>>([iterator]() mutable -> String
{
auto object_info = iterator->next(0);
if (object_info)
return object_info->getPath();
return "";
});
auto task_distributor = std::make_shared<StorageObjectStorageStableTaskDistributor>(iterator, number_of_replicas);

auto callback = std::make_shared<TaskIterator>(
[task_distributor](size_t number_of_current_replica) mutable -> String {
return task_distributor->getNextTask(number_of_current_replica).value_or("");
});

return RemoteQueryExecutor::Extension{ .task_iterator = std::move(callback) };
}

Expand Down
2 changes: 1 addition & 1 deletion src/Storages/ObjectStorage/StorageObjectStorageCluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ class StorageObjectStorageCluster : public IStorageCluster
std::string getName() const override;

RemoteQueryExecutor::Extension getTaskIteratorExtension(
const ActionsDAG::Node * predicate, const ContextPtr & context) const override;
const ActionsDAG::Node * predicate, const ContextPtr & context, size_t number_of_replicas) const override;

String getPathSample(StorageInMemoryMetadata metadata, ContextPtr context);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#include "StorageObjectStorageStableTaskDistributor.h"
#include <Common/Exception.h>
#include <Common/SipHash.h>
#include <consistent_hashing.h>
#include <optional>

namespace DB
{

StorageObjectStorageStableTaskDistributor::StorageObjectStorageStableTaskDistributor(
std::shared_ptr<IObjectIterator> iterator_,
size_t number_of_replicas_)
: iterator(std::move(iterator_))
, connection_to_files(number_of_replicas_)
, iterator_exhausted(false)
{
}

std::optional<String> StorageObjectStorageStableTaskDistributor::getNextTask(size_t number_of_current_replica)
{
LOG_TRACE(
log,
"Received a new connection from replica {} looking for a file",
number_of_current_replica
);

// 1. Check pre-queued files first
if (auto file = getPreQueuedFile(number_of_current_replica))
return file;

// 2. Try to find a matching file from the iterator
if (auto file = getMatchingFileFromIterator(number_of_current_replica))
return file;

// 3. Process unprocessed files if iterator is exhausted
return getAnyUnprocessedFile(number_of_current_replica);
}

/// Maps a file path to a replica index by feeding sipHash64 of the path and the number of
/// per-replica queues into ConsistentHashing.
/// NOTE(review): the result is presumably in [0, number of queues) - confirm against ConsistentHashing.
size_t StorageObjectStorageStableTaskDistributor::getReplicaForFile(const String & file_path)
{
    const auto path_hash = sipHash64(file_path);
    const auto replica_count = connection_to_files.size();
    return ConsistentHashing(path_hash, replica_count);
}

std::optional<String> StorageObjectStorageStableTaskDistributor::getPreQueuedFile(size_t number_of_current_replica)
{
std::lock_guard lock(mutex);

auto & files = connection_to_files[number_of_current_replica];

while (!files.empty())
{
String next_file = files.back();
files.pop_back();

auto it = unprocessed_files.find(next_file);
if (it == unprocessed_files.end())
continue;

unprocessed_files.erase(it);

LOG_TRACE(
log,
"Assigning pre-queued file {} to replica {}",
next_file,
number_of_current_replica
);

return next_file;
}

return std::nullopt;
}

std::optional<String> StorageObjectStorageStableTaskDistributor::getMatchingFileFromIterator(size_t number_of_current_replica)
{
{
std::lock_guard lock(mutex);
if (iterator_exhausted)
return std::nullopt;
}

while (true)
{
ObjectInfoPtr object_info;

{
std::lock_guard lock(mutex);
object_info = iterator->next(0);

if (!object_info)
{
iterator_exhausted = true;
break;
}
}

String file_path;

auto archive_object_info = std::dynamic_pointer_cast<StorageObjectStorageSource::ArchiveIterator::ObjectInfoInArchive>(object_info);
if (archive_object_info)
{
file_path = archive_object_info->getPathToArchive();
}
else
{
file_path = object_info->getPath();
}

size_t file_replica_idx = getReplicaForFile(file_path);
if (file_replica_idx == number_of_current_replica)
{
LOG_TRACE(
log,
"Found file {} for replica {}",
file_path,
number_of_current_replica
);

return file_path;
}

// Queue file for its assigned replica
{
std::lock_guard lock(mutex);
unprocessed_files.insert(file_path);
connection_to_files[file_replica_idx].push_back(file_path);
}
}

return std::nullopt;
}

std::optional<String> StorageObjectStorageStableTaskDistributor::getAnyUnprocessedFile(size_t number_of_current_replica)
{
std::lock_guard lock(mutex);

if (!unprocessed_files.empty())
{
auto it = unprocessed_files.begin();
String next_file = *it;
unprocessed_files.erase(it);

LOG_TRACE(
log,
"Iterator exhausted. Assigning unprocessed file {} to replica {}",
next_file,
number_of_current_replica
);

return next_file;
}

return std::nullopt;
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#pragma once

#include <Client/Connection.h>
#include <Common/Logger.h>
#include <Interpreters/Cluster.h>
#include <Storages/ObjectStorage/StorageObjectStorageSource.h>
#include <Storages/ObjectStorageQueue/ObjectStorageQueueSource.h>
#include <unordered_set>
#include <vector>
#include <mutex>
#include <memory>

namespace DB
{

/// Hands out object-storage file paths to the replicas taking part in a cluster read.
/// Each path is mapped to a replica index with ConsistentHashing over sipHash64 of the path.
/// A replica is first offered the files mapped to it; once the underlying iterator is exhausted
/// it may also receive any file that was queued for another replica but not yet handed out.
class StorageObjectStorageStableTaskDistributor
{
public:
using IObjectIterator = StorageObjectStorageSource::IIterator;
using ObjectInfoPtr = StorageObjectStorage::ObjectInfoPtr;

/// `iterator_` is the source of objects; `number_of_replicas_` is the number of
/// per-replica queues to create.
StorageObjectStorageStableTaskDistributor(
std::shared_ptr<IObjectIterator> iterator_,
size_t number_of_replicas_);

/// Returns the next file path for replica `number_of_current_replica`,
/// or std::nullopt when there is nothing left to hand out.
std::optional<String> getNextTask(size_t number_of_current_replica);

private:
/// Replica index a file path hashes to.
size_t getReplicaForFile(const String & file_path);
/// Next still-unclaimed path from this replica's own queue, if any.
std::optional<String> getPreQueuedFile(size_t number_of_current_replica);
/// Advances the iterator until a path hashing to this replica is found; other paths are queued for their replicas.
std::optional<String> getMatchingFileFromIterator(size_t number_of_current_replica);
/// Any path that is still in `unprocessed_files`, whichever replica it hashes to.
std::optional<String> getAnyUnprocessedFile(size_t number_of_current_replica);

/// Source of objects; advanced with next(0) while `mutex` is held.
std::shared_ptr<IObjectIterator> iterator;

/// One queue per replica index, holding paths that were read from the iterator on behalf of
/// a different replica. Entries may be stale: a path counts only while it is in `unprocessed_files`.
std::vector<std::vector<String>> connection_to_files;
/// Paths that were queued for some replica and have not been handed out yet.
std::unordered_set<String> unprocessed_files;

/// Held around accesses to `iterator`, the queue contents, `unprocessed_files` and `iterator_exhausted`.
std::mutex mutex;
/// Set once `iterator` has returned a null object.
bool iterator_exhausted = false;

LoggerPtr log = getLogger("StorageClusterTaskDistributor");
};

}
Loading
Loading