From bcc382bf6a5e23001c69350faff1b92acc1da0b4 Mon Sep 17 00:00:00 2001 From: DuckDB Labs GitHub Bot Date: Fri, 16 Jan 2026 05:34:31 +0000 Subject: [PATCH] Update vendored DuckDB sources to 431ad092c9 --- src/duckdb/extension/icu/icu-makedate.cpp | 9 +- .../src/common/encryption_functions.cpp | 11 +- .../src/common/encryption_key_manager.cpp | 16 +- src/duckdb/src/common/error_data.cpp | 36 +-- src/duckdb/src/common/hive_partitioning.cpp | 27 +-- .../expression_executor/execute_function.cpp | 12 + .../expression_executor/execute_operator.cpp | 7 + .../execution/expression_executor_state.cpp | 6 + .../src/function/pragma/pragma_queries.cpp | 2 +- .../function/table/version/pragma_version.cpp | 6 +- .../duckdb/common/encryption_key_manager.hpp | 6 + .../execution/expression_executor_state.hpp | 5 + .../include/duckdb/main/extension_entries.hpp | 2 + .../expression_binder/having_binder.hpp | 1 + .../table_function_binder.hpp | 10 + .../planner/binder/statement/bind_copy.cpp | 11 + .../planner/binder/statement/bind_insert.cpp | 5 +- .../expression_binder/having_binder.cpp | 4 + .../table_function_binder.cpp | 8 +- .../compression/validity_uncompressed.cpp | 221 +++++++++++------- 20 files changed, 268 insertions(+), 137 deletions(-) diff --git a/src/duckdb/extension/icu/icu-makedate.cpp b/src/duckdb/extension/icu/icu-makedate.cpp index 7c8efb2cb..2284f35cb 100644 --- a/src/duckdb/extension/icu/icu-makedate.cpp +++ b/src/duckdb/extension/icu/icu-makedate.cpp @@ -2,13 +2,12 @@ #include "duckdb/common/operator/cast_operators.hpp" #include "duckdb/common/operator/subtract.hpp" #include "duckdb/common/types/date.hpp" -#include "duckdb/common/types/time.hpp" #include "duckdb/common/types/timestamp.hpp" #include "duckdb/common/vector_operations/senary_executor.hpp" #include "duckdb/common/vector_operations/septenary_executor.hpp" #include "duckdb/function/cast/cast_function_set.hpp" #include "duckdb/main/extension/extension_loader.hpp" -#include "duckdb/parser/parsed_data/create_scalar_function_info.hpp" +#include "duckdb/main/settings.hpp" #include "include/icu-casts.hpp" #include "include/icu-datefunc.hpp" #include "include/icu-datetrunc.hpp" @@ -57,6 +56,10 @@ BoundCastInfo ICUMakeDate::BindCastToDate(BindCastInput &input, const LogicalTyp if (!input.context) { throw InternalException("Missing context for TIMESTAMPTZ to DATE cast."); } + if (DBConfig::GetSetting(*input.context)) { + throw BinderException("Casting from TIMESTAMP WITH TIME ZONE to DATE without an explicit time zone " + "has been disabled - use \"AT TIME ZONE ...\""); + } auto cast_data = make_uniq(make_uniq(*input.context)); @@ -80,7 +83,7 @@ struct ICUMakeTimestampTZFunc : public ICUDateFunc { ss -= secs; ss *= Interval::MSECS_PER_SEC; const auto millis = int32_t(ss); - int64_t micros = std::round((ss - millis) * Interval::MICROS_PER_MSEC); + int64_t micros = LossyNumericCast(std::round((ss - millis) * Interval::MICROS_PER_MSEC)); calendar->set(UCAL_YEAR, year); calendar->set(UCAL_MONTH, month); diff --git a/src/duckdb/src/common/encryption_functions.cpp b/src/duckdb/src/common/encryption_functions.cpp index 1ecf1abeb..b6c2a9576 100644 --- a/src/duckdb/src/common/encryption_functions.cpp +++ b/src/duckdb/src/common/encryption_functions.cpp @@ -33,6 +33,9 @@ idx_t EncryptionNonce::size() const { EncryptionEngine::EncryptionEngine() { } +EncryptionEngine::~EncryptionEngine() { +} + const_data_ptr_t EncryptionEngine::GetKeyFromCache(DatabaseInstance &db, const string &key_name) { auto &keys = EncryptionKeyManager::Get(db); return 
keys.GetKey(key_name); @@ -48,8 +51,8 @@ void EncryptionEngine::AddKeyToCache(DatabaseInstance &db, data_ptr_t key, const if (!keys.HasKey(key_name)) { keys.AddKey(key_name, key); } else { - // wipe out the key - std::memset(key, 0, MainHeader::DEFAULT_ENCRYPTION_KEY_LENGTH); + duckdb_mbedtls::MbedTlsWrapper::AESStateMBEDTLS::SecureClearData(key, + MainHeader::DEFAULT_ENCRYPTION_KEY_LENGTH); } } @@ -60,8 +63,8 @@ string EncryptionEngine::AddKeyToCache(DatabaseInstance &db, data_ptr_t key) { if (!keys.HasKey(key_id)) { keys.AddKey(key_id, key); } else { - // wipe out the original key - std::memset(key, 0, MainHeader::DEFAULT_ENCRYPTION_KEY_LENGTH); + duckdb_mbedtls::MbedTlsWrapper::AESStateMBEDTLS::SecureClearData(key, + MainHeader::DEFAULT_ENCRYPTION_KEY_LENGTH); } return key_id; diff --git a/src/duckdb/src/common/encryption_key_manager.cpp b/src/duckdb/src/common/encryption_key_manager.cpp index e20f2208b..902b215c9 100644 --- a/src/duckdb/src/common/encryption_key_manager.cpp +++ b/src/duckdb/src/common/encryption_key_manager.cpp @@ -74,7 +74,7 @@ string EncryptionKeyManager::GenerateRandomKeyID() { void EncryptionKeyManager::AddKey(const string &key_name, data_ptr_t key) { lock_guard guard(lock); derived_keys.emplace(key_name, EncryptionKey(key)); - // Zero-out the encryption key + // Zero-out the input encryption key duckdb_mbedtls::MbedTlsWrapper::AESStateMBEDTLS::SecureClearData(key, DERIVED_KEY_LENGTH); } @@ -91,6 +91,19 @@ const_data_ptr_t EncryptionKeyManager::GetKey(const string &key_name) const { void EncryptionKeyManager::DeleteKey(const string &key_name) { lock_guard guard(lock); + ClearKey(key_name); + EraseKey(key_name); +} + +void EncryptionKeyManager::ClearKey(const string &key_name) { + D_ASSERT(HasKey(key_name)); + auto const key_data = derived_keys.at(key_name).GetData(); + // clear the key (zero-out its memory) + duckdb_mbedtls::MbedTlsWrapper::AESStateMBEDTLS::SecureClearData(key_data, + MainHeader::DEFAULT_ENCRYPTION_KEY_LENGTH); +} + +void EncryptionKeyManager::EraseKey(const string &key_name) { derived_keys.erase(key_name); } @@ -130,6 +143,7 @@ void EncryptionKeyManager::DeriveKey(string &user_key, data_ptr_t salt, data_ptr KeyDerivationFunctionSHA256(reinterpret_cast(decoded_key.data()), decoded_key.size(), salt, derived_key); + duckdb_mbedtls::MbedTlsWrapper::AESStateMBEDTLS::SecureClearData(data_ptr_cast(&user_key[0]), user_key.size()); duckdb_mbedtls::MbedTlsWrapper::AESStateMBEDTLS::SecureClearData(data_ptr_cast(&decoded_key[0]), decoded_key.size()); diff --git a/src/duckdb/src/common/error_data.cpp b/src/duckdb/src/common/error_data.cpp index 2ddf94af6..51b26e417 100644 --- a/src/duckdb/src/common/error_data.cpp +++ b/src/duckdb/src/common/error_data.cpp @@ -18,36 +18,40 @@ ErrorData::ErrorData(const std::exception &ex) : ErrorData(ex.what()) { } ErrorData::ErrorData(ExceptionType type, const string &message) - : initialized(true), type(type), raw_message(SanitizeErrorMessage(message)), - final_message(ConstructFinalMessage()) { + : initialized(true), type(type), raw_message(SanitizeErrorMessage(message)) { + // In the case of ExceptionType::INTERNAL, the stack trace is part of the final message. + // To construct it, we need to access extra_info, which has to be initialized first. + // Thus, we only set final_message in the constructor's body. 
+ final_message = ConstructFinalMessage(); } ErrorData::ErrorData(const string &message) : initialized(true), type(ExceptionType::INVALID), raw_message(string()), final_message(string()) { - // parse the constructed JSON if (message.empty() || message[0] != '{') { - // not JSON! Use the message as a raw Exception message and leave type as uninitialized - + // Not a JSON-formatted message. + // Use the message as a raw Exception message and leave the type as uninitialized. if (message == std::bad_alloc().what()) { type = ExceptionType::OUT_OF_MEMORY; raw_message = "Allocation failure"; } else { raw_message = message; } - } else { - auto info = StringUtil::ParseJSONMap(message)->Flatten(); - for (auto &entry : info) { - if (entry.first == "exception_type") { - type = Exception::StringToExceptionType(entry.second); - } else if (entry.first == "exception_message") { - raw_message = SanitizeErrorMessage(entry.second); - } else { - extra_info[entry.first] = entry.second; - } - } + final_message = ConstructFinalMessage(); + return; } + // JSON-formatted message. + auto info = StringUtil::ParseJSONMap(message)->Flatten(); + for (auto &entry : info) { + if (entry.first == "exception_type") { + type = Exception::StringToExceptionType(entry.second); + } else if (entry.first == "exception_message") { + raw_message = SanitizeErrorMessage(entry.second); + } else { + extra_info[entry.first] = entry.second; + } + } final_message = ConstructFinalMessage(); } diff --git a/src/duckdb/src/common/hive_partitioning.cpp b/src/duckdb/src/common/hive_partitioning.cpp index 932943b8f..1a0647d2c 100644 --- a/src/duckdb/src/common/hive_partitioning.cpp +++ b/src/duckdb/src/common/hive_partitioning.cpp @@ -245,26 +245,13 @@ static void TemplatedGetHivePartitionValues(Vector &input, vector(data[sel.get_index(0)]).GetTypeMutable() != type; - if (reinterpret) { - for (idx_t i = 0; i < count; i++) { - auto &key = keys[i]; - const auto idx = sel.get_index(i); - if (validity.RowIsValid(idx)) { - key.values[col_idx] = GetHiveKeyValue(data[idx], type); - } else { - key.values[col_idx] = GetHiveKeyNullValue(type); - } - } - } else { - for (idx_t i = 0; i < count; i++) { - auto &key = keys[i]; - const auto idx = sel.get_index(i); - if (validity.RowIsValid(idx)) { - key.values[col_idx] = GetHiveKeyValue(data[idx]); - } else { - key.values[col_idx] = GetHiveKeyNullValue(type); - } + for (idx_t i = 0; i < count; i++) { + auto &key = keys[i]; + const auto idx = sel.get_index(i); + if (validity.RowIsValid(idx)) { + key.values[col_idx] = GetHiveKeyValue(data[idx], type); + } else { + key.values[col_idx] = GetHiveKeyNullValue(type); } } } diff --git a/src/duckdb/src/execution/expression_executor/execute_function.cpp b/src/duckdb/src/execution/expression_executor/execute_function.cpp index a7e99287b..84350e76b 100644 --- a/src/duckdb/src/execution/expression_executor/execute_function.cpp +++ b/src/duckdb/src/execution/expression_executor/execute_function.cpp @@ -119,6 +119,18 @@ bool ExecuteFunctionState::TryExecuteDictionaryExpression(const BoundFunctionExp return true; } +void ExecuteFunctionState::ResetDictionaryStates() { + + // Clear the cached dictionary information + current_input_dictionary_id.clear(); + output_dictionary_id.clear(); + output_dictionary.reset(); + + for (const auto &child_state : child_states) { + child_state->ResetDictionaryStates(); + } +} + unique_ptr ExpressionExecutor::InitializeState(const BoundFunctionExpression &expr, ExpressionExecutorState &root) { auto result = make_uniq(expr, root); diff --git 
a/src/duckdb/src/execution/expression_executor/execute_operator.cpp b/src/duckdb/src/execution/expression_executor/execute_operator.cpp index 04883c5de..551477270 100644 --- a/src/duckdb/src/execution/expression_executor/execute_operator.cpp +++ b/src/duckdb/src/execution/expression_executor/execute_operator.cpp @@ -123,12 +123,19 @@ void ExpressionExecutor::Execute(const BoundOperatorExpression &expr, Expression throw; } } + + // On error, evaluate per row SelectionVector selvec(1); DataChunk intermediate; intermediate.Initialize(GetAllocator(), {result.GetType()}, 1); for (idx_t i = 0; i < count; i++) { intermediate.Reset(); intermediate.SetCardinality(1); + + // Make sure to clear any dictionary states in the child expression, so that it actually + // gets executed anew for every row + child_state.ResetDictionaryStates(); + selvec.set_index(0, sel ? sel->get_index(i) : i); Value val(result.GetType()); try { diff --git a/src/duckdb/src/execution/expression_executor_state.cpp b/src/duckdb/src/execution/expression_executor_state.cpp index 070a399db..fcac6065b 100644 --- a/src/duckdb/src/execution/expression_executor_state.cpp +++ b/src/duckdb/src/execution/expression_executor_state.cpp @@ -52,6 +52,12 @@ void ExpressionState::Verify(ExpressionExecutorState &root_executor) { } } +void ExpressionState::ResetDictionaryStates() { + for (const auto &child : child_states) { + child->ResetDictionaryStates(); + } +} + void ExpressionExecutorState::Verify() { D_ASSERT(executor); root_state->Verify(*this); diff --git a/src/duckdb/src/function/pragma/pragma_queries.cpp b/src/duckdb/src/function/pragma/pragma_queries.cpp index 9107b8c01..3cf8f0f7d 100644 --- a/src/duckdb/src/function/pragma/pragma_queries.cpp +++ b/src/duckdb/src/function/pragma/pragma_queries.cpp @@ -201,7 +201,7 @@ static string PragmaDatabaseSize(ClientContext &context, const FunctionParameter } static string PragmaStorageInfo(ClientContext &context, const FunctionParameters ¶meters) { - return StringUtil::Format("SELECT * FROM pragma_storage_info('%s');", parameters.values[0].ToString()); + return StringUtil::Format("SELECT * FROM pragma_storage_info(%s);", SQLString(parameters.values[0].ToString())); } static string PragmaMetadataInfo(ClientContext &context, const FunctionParameters ¶meters) { diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index f78634288..22c2aa9b9 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "4-dev175" +#define DUCKDB_PATCH_VERSION "4-dev252" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 4 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.4.4-dev175" +#define DUCKDB_VERSION "v1.4.4-dev252" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "a56ccd8040" +#define DUCKDB_SOURCE_ID "431ad092c9" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/include/duckdb/common/encryption_key_manager.hpp b/src/duckdb/src/include/duckdb/common/encryption_key_manager.hpp index fa256eab6..31bb7223b 100644 --- a/src/duckdb/src/include/duckdb/common/encryption_key_manager.hpp +++ b/src/duckdb/src/include/duckdb/common/encryption_key_manager.hpp @@ -33,6 +33,10 @@ class EncryptionKey { return key; } + data_ptr_t GetData() { + return key; 
+ } + public: static void LockEncryptionKey(data_ptr_t key, idx_t key_len = MainHeader::DEFAULT_ENCRYPTION_KEY_LENGTH); static void UnlockEncryptionKey(data_ptr_t key, idx_t key_len = MainHeader::DEFAULT_ENCRYPTION_KEY_LENGTH); @@ -52,6 +56,8 @@ class EncryptionKeyManager : public ObjectCacheEntry { void AddKey(const string &key_name, data_ptr_t key); bool HasKey(const string &key_name) const; void DeleteKey(const string &key_name); + void ClearKey(const string &key_name); + void EraseKey(const string &key_name); const_data_ptr_t GetKey(const string &key_name) const; public: diff --git a/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp b/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp index 8f0b77ccf..772daa88b 100644 --- a/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp +++ b/src/duckdb/src/include/duckdb/execution/expression_executor_state.hpp @@ -41,6 +41,9 @@ struct ExpressionState { void Verify(ExpressionExecutorState &root); + //! Reset any cached dictionary expression states in this expression state and its children + virtual void ResetDictionaryStates(); + public: template TARGET &Cast() { @@ -67,6 +70,8 @@ struct ExecuteFunctionState : public ExpressionState { bool TryExecuteDictionaryExpression(const BoundFunctionExpression &expr, DataChunk &args, ExpressionState &state, Vector &result); + void ResetDictionaryStates() override; + public: unique_ptr local_state; diff --git a/src/duckdb/src/include/duckdb/main/extension_entries.hpp b/src/duckdb/src/include/duckdb/main/extension_entries.hpp index 1d5a8510e..85fbcbcd2 100644 --- a/src/duckdb/src/include/duckdb/main/extension_entries.hpp +++ b/src/duckdb/src/include/duckdb/main/extension_entries.hpp @@ -622,6 +622,7 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"st_hasm", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_hasz", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_hilbert", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, + {"st_interiorringn", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_interpolatepoint", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_intersection", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_intersection_agg", "spatial", CatalogType::AGGREGATE_FUNCTION_ENTRY}, @@ -1034,6 +1035,7 @@ static constexpr ExtensionEntry EXTENSION_SETTINGS[] = { {"ducklake_retry_wait_ms", "ducklake"}, {"enable_curl_server_cert_verification", "httpfs"}, {"enable_geoparquet_conversion", "parquet"}, + {"enable_global_s3_configuration", "httpfs"}, {"enable_server_cert_verification", "httpfs"}, {"force_download", "httpfs"}, {"hf_max_per_page", "httpfs"}, diff --git a/src/duckdb/src/include/duckdb/planner/expression_binder/having_binder.hpp b/src/duckdb/src/include/duckdb/planner/expression_binder/having_binder.hpp index b111cc368..cda7d0789 100644 --- a/src/duckdb/src/include/duckdb/planner/expression_binder/having_binder.hpp +++ b/src/duckdb/src/include/duckdb/planner/expression_binder/having_binder.hpp @@ -26,6 +26,7 @@ class HavingBinder : public BaseSelectBinder { BindResult BindColumnRef(unique_ptr &expr_ptr, idx_t depth, bool root_expression) override; unique_ptr QualifyColumnName(ColumnRefExpression &col_ref, ErrorData &error) override; + bool QualifyColumnAlias(const ColumnRefExpression &colref) override; private: ColumnAliasBinder column_alias_binder; diff --git a/src/duckdb/src/include/duckdb/planner/expression_binder/table_function_binder.hpp 
b/src/duckdb/src/include/duckdb/planner/expression_binder/table_function_binder.hpp index cc20dd5ed..a86c2ed45 100644 --- a/src/duckdb/src/include/duckdb/planner/expression_binder/table_function_binder.hpp +++ b/src/duckdb/src/include/duckdb/planner/expression_binder/table_function_binder.hpp @@ -18,6 +18,14 @@ class TableFunctionBinder : public ExpressionBinder { TableFunctionBinder(Binder &binder, ClientContext &context, string table_function_name = string(), string clause = "Table function"); +public: + void DisableSQLValueFunctions() { + accept_sql_value_functions = false; + } + void EnableSQLValueFunctions() { + accept_sql_value_functions = true; + } + protected: BindResult BindLambdaReference(LambdaRefExpression &expr, idx_t depth); BindResult BindColumnReference(unique_ptr &expr, idx_t depth, bool root_expression); @@ -28,6 +36,8 @@ class TableFunctionBinder : public ExpressionBinder { private: string table_function_name; string clause; + //! Whether sql_value_functions (GetSQLValueFunctionName) are considered when binding column refs + bool accept_sql_value_functions = true; }; } // namespace duckdb diff --git a/src/duckdb/src/planner/binder/statement/bind_copy.cpp b/src/duckdb/src/planner/binder/statement/bind_copy.cpp index 10bfbdc7b..212817acb 100644 --- a/src/duckdb/src/planner/binder/statement/bind_copy.cpp +++ b/src/duckdb/src/planner/binder/statement/bind_copy.cpp @@ -422,10 +422,21 @@ vector BindCopyOption(ClientContext &context, TableFunctionBinder &option return result; } } + const bool is_partition_by = StringUtil::CIEquals(name, "partition_by"); + + if (is_partition_by) { + //! When binding the 'partition_by' option, we don't want to resolve a column reference to a SQLValueFunction + //! (like 'user') + option_binder.DisableSQLValueFunctions(); + } auto bound_expr = option_binder.Bind(expr); if (bound_expr->HasParameter()) { throw ParameterNotResolvedException(); } + if (is_partition_by) { + option_binder.EnableSQLValueFunctions(); + } + auto val = ExpressionExecutor::EvaluateScalar(context, *bound_expr, true); if (val.IsNull()) { throw BinderException("NULL is not supported as a valid option for COPY option \"" + name + "\""); diff --git a/src/duckdb/src/planner/binder/statement/bind_insert.cpp b/src/duckdb/src/planner/binder/statement/bind_insert.cpp index 96222f0c5..0c4b80192 100644 --- a/src/duckdb/src/planner/binder/statement/bind_insert.cpp +++ b/src/duckdb/src/planner/binder/statement/bind_insert.cpp @@ -392,9 +392,10 @@ unique_ptr Binder::GenerateMergeInto(InsertStatement &stmt, named_column_map.push_back(col.Logical()); } } else { + // Ensure that the columns are valid. 
for (auto &col_name : stmt.columns) { - auto &col = table.GetColumn(col_name); - named_column_map.push_back(col.Logical()); + auto col_idx = table.GetColumnIndex(col_name); + named_column_map.push_back(col_idx); } } ExpandDefaultInValuesList(stmt, table, values_list, named_column_map); diff --git a/src/duckdb/src/planner/expression_binder/having_binder.cpp b/src/duckdb/src/planner/expression_binder/having_binder.cpp index 48ee194ff..562610bba 100644 --- a/src/duckdb/src/planner/expression_binder/having_binder.cpp +++ b/src/duckdb/src/planner/expression_binder/having_binder.cpp @@ -93,4 +93,8 @@ BindResult HavingBinder::BindWindow(WindowExpression &expr, idx_t depth) { throw BinderException::Unsupported(expr, "HAVING clause cannot contain window functions!"); } +bool HavingBinder::QualifyColumnAlias(const ColumnRefExpression &colref) { + return column_alias_binder.QualifyColumnAlias(colref); +} + } // namespace duckdb diff --git a/src/duckdb/src/planner/expression_binder/table_function_binder.cpp b/src/duckdb/src/planner/expression_binder/table_function_binder.cpp index 720dbe37d..8bdd193e7 100644 --- a/src/duckdb/src/planner/expression_binder/table_function_binder.cpp +++ b/src/duckdb/src/planner/expression_binder/table_function_binder.cpp @@ -51,9 +51,11 @@ BindResult TableFunctionBinder::BindColumnReference(unique_ptr } } - auto value_function = ExpressionBinder::GetSQLValueFunction(column_names.back()); - if (value_function) { - return BindExpression(value_function, depth, root_expression); + if (accept_sql_value_functions) { + auto value_function = ExpressionBinder::GetSQLValueFunction(column_names.back()); + if (value_function) { + return BindExpression(value_function, depth, root_expression); + } } if (table_function_name.empty()) { throw BinderException(query_location, diff --git a/src/duckdb/src/storage/compression/validity_uncompressed.cpp b/src/duckdb/src/storage/compression/validity_uncompressed.cpp index 4fa65f32e..20bcb0ffe 100644 --- a/src/duckdb/src/storage/compression/validity_uncompressed.cpp +++ b/src/duckdb/src/storage/compression/validity_uncompressed.cpp @@ -230,7 +230,16 @@ void ValidityUncompressed::UnalignedScan(data_ptr_t input, idx_t input_size, idx for (idx_t i = 0; i < scan_count; i++) { D_ASSERT(result_mask.RowIsValid(result_offset + i)); } + // save boundary entries to verify we don't corrupt surrounding bits later. + idx_t debug_first_entry = result_offset / ValidityMask::BITS_PER_VALUE; + idx_t debug_last_entry = (result_offset + scan_count - 1) / ValidityMask::BITS_PER_VALUE; + auto debug_result_data = (validity_t *)result_mask.GetData(); + validity_t debug_original_first_entry = + debug_result_data ? debug_result_data[debug_first_entry] : ValidityMask::ValidityBuffer::MAX_ENTRY; + validity_t debug_original_last_entry = + debug_result_data ? debug_result_data[debug_last_entry] : ValidityMask::ValidityBuffer::MAX_ENTRY; #endif + #if STANDARD_VECTOR_SIZE < 128 // fallback for tiny vector sizes // the bitwise ops we use below don't work if the vector size is too small @@ -256,108 +265,135 @@ void ValidityUncompressed::UnalignedScan(data_ptr_t input, idx_t input_size, idx idx_t input_entry = input_start / ValidityMask::BITS_PER_VALUE; idx_t input_idx = input_start - input_entry * ValidityMask::BITS_PER_VALUE; + // Window scanning algorithm -- the goal is to copy a contiguous sequence of bits from input into result, + // and to do this using bit operations on 64 bit fields. 
+ // + // On each loop iteration, we are inspecting a 64 bit field in both the input and result, starting at a certain + // index (in the code, these are denoted by input(result)_entry and input(result)_index, respectively. + // + // For example, on the first loop iteration for the diagram, both entries are entry 0, and the starting indexes are + // the index of window 1 in each entry. + // + // input(result)_window is the window from input(result)_index to the end of either the current bit field, or + // the end of the range of bits we are trying to copy if that is contained within the current entry. + // + // window is minimum(input_window, result_window), which is window 1 on the first iteration, window 2 on the + // second iteration, etc. These are what are shown in the diagram below. + // + // INPUT: + // 0 63| 127| 191 + // +-------------------------------+--------------------------------+--------------------------------+ + // | [ 1 ]|[ 2 ][ 3 ]|[ 4 ][ 5 ]| + // +-------------------------------+--------------------------------+--------------------------------+ + // + // RESULT: + // 0 63| 127| 191 + // +-------------------------------+--------------------------------+--------------------------------+ + // [ 1 ][ 2 ]|[ 3 ][ 4 ]|[ 5 ] | + // +-------------------------------+--------------------------------+--------------------------------+ + // + // Note: in case this ever becomes a bottleneck, it should be possible to make each loop iteration branchless. + // The idea would be to do an odd iteration before the loop, then have two loops depending on the layout of the + // windows that will either shift left then right on each iteration, or the other loop will always shift right + // then left. For example, in the diagram above, we would first apply the first window outside of the loop + // beforehand, then we can see that each loop iteration requires us to shift right, fetch a new result entry, + // shift left, fetch a new input entry. This would have to be generalized to two possible branchless loops, + // depending on the input. + // now start the bit games idx_t pos = 0; while (pos < scan_count) { - // these are the current validity entries we are dealing with - idx_t current_result_idx = result_entry; - idx_t offset; validity_t input_mask = input_data[input_entry]; + idx_t bits_left = scan_count - pos; - // construct the mask to AND together with the result - if (result_idx < input_idx) { - // +======================================+ - // input: |xxxxxxxxx| | - // +======================================+ - // - // +======================================+ - // result: | xxxxxxxxx| | - // +======================================+ - // 1. We shift (>>) 'input' to line up with 'result' - // 2. We set the bits we shifted to 1 + // these are bits left within the current entries (possibly extra than what we need). + idx_t input_bits_left = ValidityMask::BITS_PER_VALUE - input_idx; + idx_t result_bits_left = ValidityMask::BITS_PER_VALUE - result_idx; - // we have to shift the input RIGHT if the result_idx is smaller than the input_idx - auto shift_amount = input_idx - result_idx; - D_ASSERT(shift_amount > 0 && shift_amount <= ValidityMask::BITS_PER_VALUE); + // these are the bits left within the current entries that need to be processed. 
+ idx_t input_window_size = MinValue(bits_left, input_bits_left); + idx_t result_window_size = MinValue(bits_left, result_bits_left); - input_mask = input_mask >> shift_amount; - - // now the upper "shift_amount" bits are set to 0 - // we need them to be set to 1 - // otherwise the subsequent bitwise & will modify values outside of the range of values we want to alter - input_mask |= ValidityUncompressed::UPPER_MASKS[shift_amount]; + // the smaller of the two is our next window to copy from input to result. + idx_t window_size = MinValue(input_window_size, result_window_size); - if (pos == 0) { - // We also need to set the lower bits, which are to the left of the relevant bits (x), to 1 - // These are the bits that are "behind" this scan window, and should not affect this scan - auto non_relevant_mask = ValidityUncompressed::LOWER_MASKS[result_idx]; - input_mask |= non_relevant_mask; - } + // Now within each loop iteration, we can think of the general case that handles all scenarios as just + // copying the window from the starting index in input to the window in the starting index of result. - // after this, we move to the next input_entry - offset = ValidityMask::BITS_PER_VALUE - input_idx; - input_entry++; - input_idx = 0; - result_idx += offset; - } else if (result_idx > input_idx) { - // +======================================+ - // input: | xxxxxxxxx| | - // +======================================+ + // First, line up the windows: + if (result_idx < input_idx) { + // X is arbitrary bits, P is arbitrary protected bits. + // INPUT ENTRY: + // 63 0 + // +--------------------------------------------------------------------------------------------------+ + // |XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX[=============WINDOW=============]XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX| + // +--------------------------------------------------------------------------------------------------+ + // ^ + // input_idx // - // +======================================+ - // result: |xxxxxxxxx| | - // +======================================+ - // 1. We set the bits to the left of the relevant bits (x) to 0 - // 1. We shift (<<) 'input' to line up with 'result' - // 2. 
We set the bits that we zeroed to the right of the relevant bits (x) to 1 - - // we have to shift the input LEFT if the result_idx is bigger than the input_idx - auto shift_amount = result_idx - input_idx; - D_ASSERT(shift_amount > 0 && shift_amount <= ValidityMask::BITS_PER_VALUE); - - // to avoid overflows, we set the upper "shift_amount" values to 0 first - input_mask = (input_mask & ~ValidityUncompressed::UPPER_MASKS[shift_amount]) << shift_amount; - - // now the lower "shift_amount" bits are set to 0 - // we need them to be set to 1 - // otherwise the subsequent bitwise & will modify values outside of the range of values we want to alter - input_mask |= ValidityUncompressed::LOWER_MASKS[shift_amount]; - - // after this, we move to the next result_entry - offset = ValidityMask::BITS_PER_VALUE - result_idx; - result_entry++; - result_idx = 0; - input_idx += offset; - } else { - // if the input_idx is equal to result_idx they are already aligned - // we just move to the next entry for both after this - offset = ValidityMask::BITS_PER_VALUE - result_idx; - input_entry++; - result_entry++; - result_idx = input_idx = 0; - } - // now we need to check if we should include the ENTIRE mask - // OR if we need to mask from the right side - pos += offset; - if (pos > scan_count) { - // +======================================+ - // mask: | |xxxxxxxxxxxxxxxxxxxxxxxxx| - // +======================================+ + // RESULT ENTRY: + // 63 0 + // +--------------------------------------------------------------------------------------------------+ + // |PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP[=============WINDOW=============]PPPPPPPPPPPPPPPPPPPPPP| + // +--------------------------------------------------------------------------------------------------+ + // ^ + // result_idx // - // The bits on the right side of the relevant bits (x) need to stay 1, to be adjusted by later scans - // so we adjust the mask to clear out any 0s that might be present on the right side. + idx_t shift_amount = input_idx - result_idx; + input_mask = input_mask >> shift_amount; + } else { + // current_result_idx >= current_input_idx + idx_t shift_amount = result_idx - input_idx; + input_mask = (input_mask & ~UPPER_MASKS[shift_amount]); - // we need to set any bits that are past the scan_count on the right-side to 1 - // this is required so we don't influence any bits that are not part of the scan - input_mask |= ValidityUncompressed::UPPER_MASKS[pos - scan_count]; + // X is arbitrary bits, P is arbitrary protected bits. + // Note the zeroed out bits in INPUT_ENTRY - these have to be zeroed before shifting left to align with + // result window, to prevent overflow. 
+ // + // INPUT ENTRY: + // 63 0 + // +--------------------------------------------------------------------------------------------------+ + // |000000000000XXXXXXXXXXXXXXXXXXXX[=============WINDOW=============]XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX| + // +--------------------------------------------------------------------------------------------------+ + // ^ + // input_idx + // + // RESULT ENTRY: + // 63 0 + // +--------------------------------------------------------------------------------------------------+ + // |PPPPPPPPPPPPPPPPPPPP[=============WINDOW=============]PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP| + // +--------------------------------------------------------------------------------------------------+ + // ^ + // result_idx + input_mask = input_mask << shift_amount; } - // now finally we can merge the input mask with the result mask + + // Once the windows are aligned, mask the input to prevent overwriting protected bits in the result_mask. + auto protected_upper_bits = UPPER_MASKS[ValidityMask::BITS_PER_VALUE - result_idx - window_size]; + auto protected_lower_bits = LOWER_MASKS[result_idx]; + input_mask |= protected_upper_bits; + input_mask |= protected_lower_bits; + if (input_mask != ValidityMask::ValidityBuffer::MAX_ENTRY) { if (!result_data) { result_mask.Initialize(); result_data = (validity_t *)result_mask.GetData(); } - result_data[current_result_idx] &= input_mask; + result_data[result_entry] &= input_mask; + } + // Now update pos, entries, and indexes for the next iteration. + pos += window_size; + + // Windows can only go until the end of the current entry, so the mod can only wrap to 0 here. + input_idx = (input_idx + window_size) % ValidityMask::BITS_PER_VALUE; + result_idx = (result_idx + window_size) % ValidityMask::BITS_PER_VALUE; + + // Advance entries if the mod was 0. + if (input_idx == 0) { + input_entry++; + } + if (result_idx == 0) { + result_entry++; } } #endif @@ -368,6 +404,23 @@ void ValidityUncompressed::UnalignedScan(data_ptr_t input, idx_t input_size, idx for (idx_t i = 0; i < scan_count; i++) { D_ASSERT(result_mask.RowIsValid(result_offset + i) == input_mask.RowIsValid(input_start + i)); } + // verify surrounding bits weren't modified + auto debug_final_result_data = (validity_t *)result_mask.GetData(); + validity_t debug_final_first_entry = + debug_final_result_data ? debug_final_result_data[debug_first_entry] : ValidityMask::ValidityBuffer::MAX_ENTRY; + validity_t debug_final_last_entry = + debug_final_result_data ? debug_final_result_data[debug_last_entry] : ValidityMask::ValidityBuffer::MAX_ENTRY; + + idx_t first_bit_in_first_entry = result_offset % ValidityMask::BITS_PER_VALUE; + idx_t last_bit_in_last_entry = (result_offset + scan_count - 1) % ValidityMask::BITS_PER_VALUE; + + // lower bits of first entry should be unchanged + validity_t lower_mask = LOWER_MASKS[first_bit_in_first_entry]; + D_ASSERT((debug_original_first_entry & lower_mask) == (debug_final_first_entry & lower_mask)); + + // upper bits of last entry should be unchanged + validity_t upper_mask = UPPER_MASKS[ValidityMask::BITS_PER_VALUE - last_bit_in_last_entry - 1]; + D_ASSERT((debug_original_last_entry & upper_mask) == (debug_final_last_entry & upper_mask)); #endif }
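Note on the validity_uncompressed.cpp hunk above: the comments added to UnalignedScan describe a window-based copy of an unaligned bit range between 64-bit validity words. The standalone C++ sketch below illustrates the same idea outside of DuckDB. The names (AndMergeBits, LowerMask, UpperMask) and the plain uint64_t buffers are inventions of this sketch, not DuckDB APIs; the real code additionally handles tiny vector sizes, lazily allocated masks, and uses precomputed LOWER_MASKS/UPPER_MASKS tables.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    static constexpr uint64_t BITS_PER_WORD = 64;

    // Mask with the lowest n bits set (n in [0, 64]).
    static uint64_t LowerMask(uint64_t n) {
        return n >= BITS_PER_WORD ? ~0ULL : (1ULL << n) - 1;
    }

    // Mask with the highest n bits set (n in [0, 64]).
    static uint64_t UpperMask(uint64_t n) {
        return n == 0 ? 0 : ~0ULL << (BITS_PER_WORD - n);
    }

    // AND-merge `count` bits of `src` (starting at bit src_start) into `dst`
    // (starting at bit dst_start); bits of `dst` outside that range are preserved,
    // mirroring how the patched scan protects the surrounding validity bits.
    void AndMergeBits(const uint64_t *src, uint64_t src_start, uint64_t *dst, uint64_t dst_start, uint64_t count) {
        uint64_t pos = 0;
        uint64_t src_word = src_start / BITS_PER_WORD, src_idx = src_start % BITS_PER_WORD;
        uint64_t dst_word = dst_start / BITS_PER_WORD, dst_idx = dst_start % BITS_PER_WORD;
        while (pos < count) {
            // The window ends where the remaining bits, the current source word,
            // or the current destination word runs out first.
            uint64_t window = std::min({count - pos, BITS_PER_WORD - src_idx, BITS_PER_WORD - dst_idx});
            uint64_t mask = src[src_word];
            if (dst_idx < src_idx) {
                // Shift the source right so its window lines up with the destination window.
                mask >>= src_idx - dst_idx;
            } else {
                // Zero the bits above the window, then shift left to line up (as in the patch).
                uint64_t shift = dst_idx - src_idx;
                mask = (mask & ~UpperMask(shift)) << shift;
            }
            // Protect destination bits outside the window: for an AND-merge,
            // forcing them to 1 in the mask leaves them unchanged.
            mask |= UpperMask(BITS_PER_WORD - dst_idx - window);
            mask |= LowerMask(dst_idx);
            dst[dst_word] &= mask;
            // Advance; an index can only wrap to exactly 0, because a window never crosses a word boundary.
            pos += window;
            src_idx = (src_idx + window) % BITS_PER_WORD;
            dst_idx = (dst_idx + window) % BITS_PER_WORD;
            if (src_idx == 0) {
                src_word++;
            }
            if (dst_idx == 0) {
                dst_word++;
            }
        }
    }

    int main() {
        uint64_t src[2] = {0xF0F0F0F0F0F0F0F0ULL, 0x0F0F0F0F0F0F0F0FULL};
        uint64_t dst[2] = {~0ULL, ~0ULL};
        // Merge 70 bits starting at source bit 5 into destination bit 9;
        // destination bits 0-8 and 79-127 must remain set afterwards.
        AndMergeBits(src, 5, dst, 9, 70);
        std::printf("%016llx %016llx\n", (unsigned long long)dst[0], (unsigned long long)dst[1]);
        return 0;
    }

Because the merge is an AND (it can only clear bits), setting the protected bits of the aligned mask to 1 leaves the destination bits outside the window untouched, which is exactly the invariant the new debug checks at the end of UnalignedScan assert on the first and last result entries.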
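Note on the encryption hunks above: the patch replaces std::memset with MbedTlsWrapper::AESStateMBEDTLS::SecureClearData when wiping key buffers. A plain memset on memory that is never read again is a dead store the optimizer may legally remove, so key material could survive in memory. The sketch below shows one common way to implement such a guaranteed wipe; it is only an illustration of the idea, not the actual mbedtls or DuckDB implementation, which may rely on mbedtls_platform_zeroize or platform-specific primitives instead.

    #include <cstddef>

    // Guaranteed wipe: writing through a volatile pointer forces every byte store
    // to be emitted, so the compiler cannot drop the clear as a dead store the way
    // it may with a final std::memset before the buffer is freed.
    void SecureZero(void *buffer, std::size_t length) {
        volatile unsigned char *p = static_cast<volatile unsigned char *>(buffer);
        while (length--) {
            *p++ = 0;
        }
    }

The patched EncryptionKeyManager::DeleteKey follows the same discipline in two steps: ClearKey() overwrites the key bytes via SecureClearData, and only then does EraseKey() remove the map entry.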