From cb49eb52de0ab693df1a20e96f1fbb1986192211 Mon Sep 17 00:00:00 2001 From: Vladislav Oleshko Date: Mon, 1 Dec 2025 17:04:44 +0300 Subject: [PATCH 1/6] fix(search): Fix numeric tree splits --- src/core/search/block_list.cc | 16 ++++++++------ src/core/search/block_list.h | 2 ++ src/core/search/range_tree.cc | 40 +++++++++++++++++++++-------------- src/core/search/range_tree.h | 28 ++++++++++++++++++------ 4 files changed, 58 insertions(+), 28 deletions(-) diff --git a/src/core/search/block_list.cc b/src/core/search/block_list.cc index 55213f424f35..ec74f5f3f212 100644 --- a/src/core/search/block_list.cc +++ b/src/core/search/block_list.cc @@ -40,15 +40,17 @@ SplitResult Split(BlockList>>&& block_list left.ReserveBlocks(block_list.blocks_.size() / 2 + 1); right.ReserveBlocks(block_list.blocks_.size() / 2 + 1); - double min_value_in_right_part = std::numeric_limits::infinity(); + double rmin = std::numeric_limits::infinity(); + double lmax = -std::numeric_limits::infinity(), rmax = lmax; + for (const Entry& entry : block_list) { if (entry.second < median_value) { left.PushBack(entry); + lmax = std::max(lmax, entry.second); } else if (entry.second > median_value) { right.PushBack(entry); - if (entry.second < min_value_in_right_part) { - min_value_in_right_part = entry.second; - } + rmin = std::min(rmin, entry.second); + rmax = std::max(rmax, entry.second); } else { median_entries.push_back(entry); } @@ -58,19 +60,21 @@ SplitResult Split(BlockList>>&& block_list if (left.Size() < right.Size()) { // If left is smaller, we can add median entries to it // We need to change median value to the right part - median_value = min_value_in_right_part; + median_value = rmin; + lmax = median_value; for (const auto& entry : median_entries) { left.Insert(entry); } } else { // If right part is smaller, we can add median entries to it // Median value is still the same + rmax = std::max(rmax, median_value); for (const auto& entry : median_entries) { right.Insert(entry); } } - return {std::move(left), std::move(right), median_value}; + return {std::move(left), std::move(right), median_value, lmax, rmax}; } template bool BlockList::Insert(ElementType t) { diff --git a/src/core/search/block_list.h b/src/core/search/block_list.h index 2fa433701381..fe9addcedfd2 100644 --- a/src/core/search/block_list.h +++ b/src/core/search/block_list.h @@ -218,5 +218,7 @@ struct SplitResult { Container left; Container right; double median; + + double lmax, rmax; }; } // namespace dfly::search diff --git a/src/core/search/range_tree.cc b/src/core/search/range_tree.cc index 61b9d881c679..9f5b3265cd6d 100644 --- a/src/core/search/range_tree.cc +++ b/src/core/search/range_tree.cc @@ -79,8 +79,7 @@ RangeTree::RangeTree(PMR_NS::memory_resource* mr, size_t max_range_block_size, entries_(mr), enable_splitting_(enable_splitting) { entries_.emplace( - std::piecewise_construct, - std::forward_as_tuple(-std::numeric_limits::infinity()), + std::piecewise_construct, std::forward_as_tuple(-std::numeric_limits::infinity()), std::forward_as_tuple(entries_.get_allocator().resource(), max_range_block_size_)); } @@ -88,16 +87,19 @@ void RangeTree::Add(DocId id, double value) { DCHECK(std::isfinite(value)); auto it = FindRangeBlock(value); - RangeBlock& block = it->second; + auto& [lb, block] = *it; auto insert_result = block.Insert({id, value}); LOG_IF(ERROR, !insert_result) << "RangeTree: Failed to insert id: " << id << ", value: " << value; - if (!enable_splitting_ || block.Size() <= max_range_block_size_) { + if (!enable_splitting_ || block.Size() <= max_range_block_size_) + return; + + // Block consists just of a single value, not worth trying + if (lb == block.maxv) return; - } - SplitBlock(std::move(it)); + SplitBlock(it); } void RangeTree::Remove(DocId id, double value) { @@ -109,10 +111,16 @@ void RangeTree::Remove(DocId id, double value) { auto remove_result = block.Remove({id, value}); LOG_IF(ERROR, !remove_result) << "RangeTree: Failed to remove id: " << id << ", value: " << value; - // TODO: maybe merging blocks if they are too small - // The problem that for each mutable operation we do Remove and then Add, - // So we can do merge and split for one operation. - // Or in common cases users do not remove a lot of documents? + // Merge with left block if both are relatively small and won't be forced to split soon + if (block.size() < max_range_block_size_ / 4 && it != entries_.begin()) { + auto lit = --it; + auto& lblock = lit->second; + if (block.Size() + lblock.Size() < max_range_block_size_ / 2) { + for (auto e : block) + lblock.Insert(e); + entries_.erase(it); + } + } } RangeResult RangeTree::Range(double l, double r) const { @@ -221,11 +229,11 @@ TODO: we can optimize this case by splitting to three blocks: - empty right block with range [std::nextafter(m, +inf), r) */ void RangeTree::SplitBlock(Map::iterator it) { - const RangeNumber l = it->first; + const double l = it->first; auto split_result = Split(std::move(it->second)); - const RangeNumber m = split_result.median; + const double m = split_result.median; DCHECK(!split_result.right.Empty()); entries_.erase(it); @@ -234,11 +242,11 @@ void RangeTree::SplitBlock(Map::iterator it) { // If l == m, it means that all values in the block were equal to the median value // We can not insert an empty block with range [l, l) because it is not valid. entries_.emplace(std::piecewise_construct, std::forward_as_tuple(l), - std::forward_as_tuple(std::move(split_result.left))); + std::forward_as_tuple(std::move(split_result.left), split_result.lmax)); } entries_.emplace(std::piecewise_construct, std::forward_as_tuple(m), - std::forward_as_tuple(std::move(split_result.right))); + std::forward_as_tuple(std::move(split_result.right), split_result.rmax)); DCHECK(TreeIsInCorrectState()); } @@ -249,9 +257,9 @@ void RangeTree::SplitBlock(Map::iterator it) { return false; } - Key prev_range = entries_.begin()->first; + double prev_range = entries_.begin()->first; for (auto it = std::next(entries_.begin()); it != entries_.end(); ++it) { - const Key& current_range = it->first; + const double& current_range = it->first; // Check that ranges are non-overlapping and sorted // Also there can not be gaps between ranges diff --git a/src/core/search/range_tree.h b/src/core/search/range_tree.h index 7805ae073cef..713509cf18f3 100644 --- a/src/core/search/range_tree.h +++ b/src/core/search/range_tree.h @@ -33,14 +33,30 @@ class RangeTree { public: friend class RangeResult; - using RangeNumber = double; - using Key = RangeNumber; using Entry = std::pair; - using RangeBlock = BlockList>; - using Map = absl::btree_map, - PMR_NS::polymorphic_allocator>>; - static constexpr size_t kDefaultMaxRangeBlockSize = 7000; + // Main node of numeric tree + struct RangeBlock : public BlockList> { + template + explicit RangeBlock(PMR_NS::memory_resource* mr, Ts... ts) : BlockList{mr, ts...} { + } + + RangeBlock(BlockList>&& bs, double maxv) + : BlockList{std::move(bs)}, maxv{maxv} { + } + + bool Insert(Entry e) { + maxv = std::max(maxv, e.second); + return BlockList::Insert(e); + } + + double maxv = -std::numeric_limits::infinity(); // max value seen + }; + + using Map = absl::btree_map, + PMR_NS::polymorphic_allocator>>; + + static constexpr size_t kDefaultMaxRangeBlockSize = 10'000; static constexpr size_t kBlockSize = 400; explicit RangeTree(PMR_NS::memory_resource* mr, From c0c158309a51deded3f725547b7823d85fcd93c6 Mon Sep 17 00:00:00 2001 From: Vladislav Oleshko Date: Mon, 1 Dec 2025 18:50:37 +0300 Subject: [PATCH 2/6] Add stats, fix split algorithm Signed-off-by: Vladislav Oleshko --- src/core/search/block_list.cc | 4 +-- src/core/search/range_tree.cc | 9 +++++- src/core/search/range_tree.h | 21 +++++++++--- src/core/search/range_tree_test.cc | 52 ++++++++++++++++++++++++++++++ 4 files changed, 79 insertions(+), 7 deletions(-) diff --git a/src/core/search/block_list.cc b/src/core/search/block_list.cc index ec74f5f3f212..583e80ef7948 100644 --- a/src/core/search/block_list.cc +++ b/src/core/search/block_list.cc @@ -59,9 +59,9 @@ SplitResult Split(BlockList>>&& block_list if (left.Size() < right.Size()) { // If left is smaller, we can add median entries to it - // We need to change median value to the right part - median_value = rmin; + // We need to change median value to the right part and update lmax lmax = median_value; + median_value = rmin; for (const auto& entry : median_entries) { left.Insert(entry); } diff --git a/src/core/search/range_tree.cc b/src/core/search/range_tree.cc index 9f5b3265cd6d..2a86c85080c2 100644 --- a/src/core/search/range_tree.cc +++ b/src/core/search/range_tree.cc @@ -96,7 +96,8 @@ void RangeTree::Add(DocId id, double value) { return; // Block consists just of a single value, not worth trying - if (lb == block.maxv) + // TODO: Detect even before inserting + if (lb == block.max_seen) return; SplitBlock(it); @@ -119,6 +120,7 @@ void RangeTree::Remove(DocId id, double value) { for (auto e : block) lblock.Insert(e); entries_.erase(it); + stats_.merges++; } } } @@ -237,6 +239,7 @@ void RangeTree::SplitBlock(Map::iterator it) { DCHECK(!split_result.right.Empty()); entries_.erase(it); + stats_.splits++; if (l != m) { // If l == m, it means that all values in the block were equal to the median value @@ -251,6 +254,10 @@ void RangeTree::SplitBlock(Map::iterator it) { DCHECK(TreeIsInCorrectState()); } +RangeTree::Stats RangeTree::GetStats() const { + return Stats{.splits = stats_.splits, .merges = stats_.merges, .block_count = entries_.size()}; +} + // Used for DCHECKs to check that the tree is in a correct state. [[maybe_unused]] bool RangeTree::TreeIsInCorrectState() const { if (entries_.empty()) { diff --git a/src/core/search/range_tree.h b/src/core/search/range_tree.h index 713509cf18f3..8aa51dabd8cf 100644 --- a/src/core/search/range_tree.h +++ b/src/core/search/range_tree.h @@ -42,22 +42,22 @@ class RangeTree { } RangeBlock(BlockList>&& bs, double maxv) - : BlockList{std::move(bs)}, maxv{maxv} { + : BlockList{std::move(bs)}, max_seen{maxv} { } bool Insert(Entry e) { - maxv = std::max(maxv, e.second); + max_seen = std::max(max_seen, e.second); return BlockList::Insert(e); } - double maxv = -std::numeric_limits::infinity(); // max value seen + // Max value seen, might be not present anymore + double max_seen = -std::numeric_limits::infinity(); }; using Map = absl::btree_map, PMR_NS::polymorphic_allocator>>; static constexpr size_t kDefaultMaxRangeBlockSize = 10'000; - static constexpr size_t kBlockSize = 400; explicit RangeTree(PMR_NS::memory_resource* mr, size_t max_range_block_size = kDefaultMaxRangeBlockSize, @@ -80,6 +80,14 @@ class RangeTree { void FinalizeInitialization(); + struct Stats { + size_t splits = 0; + size_t merges = 0; + size_t block_count = 0; + }; + + Stats GetStats() const; + private: Map::iterator FindRangeBlock(double value); Map::const_iterator FindRangeBlock(double value) const; @@ -94,6 +102,11 @@ class RangeTree { size_t max_range_block_size_; Map entries_; + struct { + size_t splits = 0; + size_t merges = 0; + } stats_; + /* During index initialization, we are using temporary buffer to store all entries. That is needed to avoid unnecessary complexity of splitting blocks. After calling FinishInitialization, the tmp_buffer_ is cleared and diff --git a/src/core/search/range_tree_test.cc b/src/core/search/range_tree_test.cc index b096c2ab8140..f78b651e2d52 100644 --- a/src/core/search/range_tree_test.cc +++ b/src/core/search/range_tree_test.cc @@ -310,6 +310,58 @@ TEST_F(RangeTreeTest, Range) { } } +// Don't split single block with same value +TEST_F(RangeTreeTest, SingleBlockSplit) { + RangeTree tree{PMR_NS::get_default_resource(), 4}; + + for (DocId id = 1; id <= 16; id++) + tree.Add(id, 5.0); + + // One split was made to create an empty leftmost block + auto stats = tree.GetStats(); + EXPECT_EQ(stats.splits, 1u); + EXPECT_EQ(stats.block_count, 2u); + + // Add value that causes one more split + tree.Add(20, 6.0); + + stats = tree.GetStats(); + EXPECT_EQ(stats.splits, 2u); + EXPECT_EQ(stats.block_count, 3u); + + // No more splits with same 5.0 + tree.Add(17, 5.0); + stats = tree.GetStats(); + EXPECT_EQ(stats.splits, 2u); + + // Verify block sizes + auto blocks = tree.GetAllBlocks(); + EXPECT_EQ(blocks[0]->Size(), 0u); + EXPECT_EQ(blocks[1]->Size(), 17u); + EXPECT_EQ(blocks[2]->Size(), 1u); +} + +TEST_F(RangeTreeTest, BlockMerge) { + RangeTree tree{PMR_NS::get_default_resource(), 8}; + for (DocId id = 1; id <= 8; id++) + tree.Add(id, id); + + tree.Add(9, 9.0); // will cause split + + auto stats = tree.GetStats(); + EXPECT_EQ(stats.splits, 1u); + EXPECT_EQ(stats.block_count, 2u); + + // Delete all except first and last, should trigger merge + for (DocId id = 2; id <= 8; id++) + tree.Remove(id, id); + + // Only one block left now + stats = tree.GetStats(); + EXPECT_EQ(stats.merges, 1u); + EXPECT_EQ(stats.block_count, 1u); +} + TEST_F(RangeTreeTest, BugNotUniqueDoubleValues) { // TODO: fix the bug GTEST_SKIP() << "Bug not fixed yet"; From 0f2330f5a6530feee059c9e9cae778cd48df75fa Mon Sep 17 00:00:00 2001 From: Vladislav Oleshko Date: Tue, 2 Dec 2025 14:07:44 +0300 Subject: [PATCH 3/6] more fixes --- src/core/search/block_list.cc | 6 +++-- src/core/search/block_list.h | 2 +- src/core/search/range_tree.cc | 38 ++++++++++++++++++++---------- src/core/search/range_tree.h | 1 + src/core/search/range_tree_test.cc | 16 +++++++------ 5 files changed, 40 insertions(+), 23 deletions(-) diff --git a/src/core/search/block_list.cc b/src/core/search/block_list.cc index 583e80ef7948..8ff93806e46a 100644 --- a/src/core/search/block_list.cc +++ b/src/core/search/block_list.cc @@ -40,12 +40,13 @@ SplitResult Split(BlockList>>&& block_list left.ReserveBlocks(block_list.blocks_.size() / 2 + 1); right.ReserveBlocks(block_list.blocks_.size() / 2 + 1); - double rmin = std::numeric_limits::infinity(); + double lmin = std::numeric_limits::infinity(), rmin = lmin; double lmax = -std::numeric_limits::infinity(), rmax = lmax; for (const Entry& entry : block_list) { if (entry.second < median_value) { left.PushBack(entry); + lmin = std::min(lmin, entry.second); lmax = std::max(lmax, entry.second); } else if (entry.second > median_value) { right.PushBack(entry); @@ -61,6 +62,7 @@ SplitResult Split(BlockList>>&& block_list // If left is smaller, we can add median entries to it // We need to change median value to the right part and update lmax lmax = median_value; + lmin = std::min(lmin, median_value); median_value = rmin; for (const auto& entry : median_entries) { left.Insert(entry); @@ -74,7 +76,7 @@ SplitResult Split(BlockList>>&& block_list } } - return {std::move(left), std::move(right), median_value, lmax, rmax}; + return {std::move(left), std::move(right), median_value, lmin, lmax, rmax}; } template bool BlockList::Insert(ElementType t) { diff --git a/src/core/search/block_list.h b/src/core/search/block_list.h index fe9addcedfd2..cb9f536ffc88 100644 --- a/src/core/search/block_list.h +++ b/src/core/search/block_list.h @@ -219,6 +219,6 @@ struct SplitResult { Container right; double median; - double lmax, rmax; + double lmin, lmax, rmax; }; } // namespace dfly::search diff --git a/src/core/search/range_tree.cc b/src/core/search/range_tree.cc index 2a86c85080c2..06f29f01ea88 100644 --- a/src/core/search/range_tree.cc +++ b/src/core/search/range_tree.cc @@ -78,9 +78,7 @@ RangeTree::RangeTree(PMR_NS::memory_resource* mr, size_t max_range_block_size, : max_range_block_size_(max_range_block_size), entries_(mr), enable_splitting_(enable_splitting) { - entries_.emplace( - std::piecewise_construct, std::forward_as_tuple(-std::numeric_limits::infinity()), - std::forward_as_tuple(entries_.get_allocator().resource(), max_range_block_size_)); + CreateEmptyBlock(-std::numeric_limits::infinity()); } void RangeTree::Add(DocId id, double value) { @@ -89,14 +87,22 @@ void RangeTree::Add(DocId id, double value) { auto it = FindRangeBlock(value); auto& [lb, block] = *it; + // Don't disrupt large monovalue blocks, instead create new nextafter block + if (enable_splitting_ && block.Size() >= max_range_block_size_ && + lb == block.max_seen /* monovalue */ && value != lb /* but new value is different*/ + ) { + double lb2 = std::nextafter(lb, std::numeric_limits::infinity()); + CreateEmptyBlock(lb2)->second.Insert({id, value}); + return; + } + auto insert_result = block.Insert({id, value}); LOG_IF(ERROR, !insert_result) << "RangeTree: Failed to insert id: " << id << ", value: " << value; if (!enable_splitting_ || block.Size() <= max_range_block_size_) return; - // Block consists just of a single value, not worth trying - // TODO: Detect even before inserting + // Large monovalue block, not reducable by splitting if (lb == block.max_seen) return; @@ -185,10 +191,7 @@ void RangeTree::FinalizeInitialization() { for (size_t b = 0; b < entries.size(); b += max_range_block_size_) { RangeBlock* range_block; if (b) { - auto it = entries_.emplace( - std::piecewise_construct, std::forward_as_tuple(entries[b].second), - std::forward_as_tuple(entries_.get_allocator().resource(), max_range_block_size_)); - range_block = &it.first->second; + range_block = &CreateEmptyBlock(entries[b].second)->second; } else { range_block = █ } @@ -210,6 +213,13 @@ RangeTree::Map::const_iterator RangeTree::FindRangeBlock(double value) const { return FindRangeBlockImpl(entries_, value); } +RangeTree::Map::iterator RangeTree::CreateEmptyBlock(double lb) { + return entries_ + .emplace(std::piecewise_construct, std::forward_as_tuple(lb), + std::forward_as_tuple(entries_.get_allocator().resource(), max_range_block_size_)) + .first; +} + /* There is an edge case in the SplitBlock method: If split_result.left.Size() == 0, it means that all values in the block @@ -231,7 +241,7 @@ TODO: we can optimize this case by splitting to three blocks: - empty right block with range [std::nextafter(m, +inf), r) */ void RangeTree::SplitBlock(Map::iterator it) { - const double l = it->first; + double l = it->first; auto split_result = Split(std::move(it->second)); @@ -241,9 +251,11 @@ void RangeTree::SplitBlock(Map::iterator it) { entries_.erase(it); stats_.splits++; - if (l != m) { - // If l == m, it means that all values in the block were equal to the median value - // We can not insert an empty block with range [l, l) because it is not valid. + // Insert left block if its not empty, but always keep left infinity bound + if (!split_result.left.Empty() || std::isinf(l)) { + if (!std::isinf(l)) + l = split_result.lmin; + entries_.emplace(std::piecewise_construct, std::forward_as_tuple(l), std::forward_as_tuple(std::move(split_result.left), split_result.lmax)); } diff --git a/src/core/search/range_tree.h b/src/core/search/range_tree.h index 8aa51dabd8cf..6ef6c0fffe47 100644 --- a/src/core/search/range_tree.h +++ b/src/core/search/range_tree.h @@ -92,6 +92,7 @@ class RangeTree { Map::iterator FindRangeBlock(double value); Map::const_iterator FindRangeBlock(double value) const; + Map::iterator CreateEmptyBlock(double lb); void SplitBlock(Map::iterator it); // Used for DCHECKs diff --git a/src/core/search/range_tree_test.cc b/src/core/search/range_tree_test.cc index f78b651e2d52..d4ee2659e561 100644 --- a/src/core/search/range_tree_test.cc +++ b/src/core/search/range_tree_test.cc @@ -114,10 +114,12 @@ TEST_F(RangeTreeTest, Add) { tree.Add(3, 20.0); tree.Add(4, 30.0); tree.Add(5, 30.0); + tree.Add(6, 30.0); auto result = tree.RangeBlocks(10.0, 30.0); - EXPECT_THAT(result, UnorderedElementsAreDocPairs( - {{1, 10.0}, {1, 20.0}, {2, 20.0}, {3, 20.0}, {4, 30.0}, {5, 30.0}})); + EXPECT_THAT(result, + UnorderedElementsAreDocPairs( + {{1, 10.0}, {1, 20.0}, {2, 20.0}, {3, 20.0}, {4, 30.0}, {5, 30.0}, {6, 30.0}})); // Test that the ranges was split correctly result = tree.RangeBlocks(kMinRangeValue, 19.0); @@ -127,7 +129,7 @@ TEST_F(RangeTreeTest, Add) { EXPECT_THAT(result, UnorderedElementsAreDocPairs({{1, 20.0}, {2, 20.0}, {3, 20.0}})); result = tree.RangeBlocks(30.0, kMaxRangeValue); - EXPECT_THAT(result, UnorderedElementsAreDocPairs({{4, 30.0}, {5, 30.0}})); + EXPECT_THAT(result, UnorderedElementsAreDocPairs({{4, 30.0}, {5, 30.0}, {6, 30.0}})); } TEST_F(RangeTreeTest, RemoveSimple) { @@ -322,17 +324,17 @@ TEST_F(RangeTreeTest, SingleBlockSplit) { EXPECT_EQ(stats.splits, 1u); EXPECT_EQ(stats.block_count, 2u); - // Add value that causes one more split + // Add value that causes one a new block started tree.Add(20, 6.0); stats = tree.GetStats(); - EXPECT_EQ(stats.splits, 2u); - EXPECT_EQ(stats.block_count, 3u); + EXPECT_EQ(stats.splits, 1u); // detected ahead, so no split + EXPECT_EQ(stats.block_count, 3u); // but new block // No more splits with same 5.0 tree.Add(17, 5.0); stats = tree.GetStats(); - EXPECT_EQ(stats.splits, 2u); + EXPECT_EQ(stats.splits, 1u); // Verify block sizes auto blocks = tree.GetAllBlocks(); From c71c436b11d7838e0b8086da35dfd8b1f607ba5a Mon Sep 17 00:00:00 2001 From: Vladislav Oleshko Date: Tue, 2 Dec 2025 15:39:16 +0300 Subject: [PATCH 4/6] more more fixes Signed-off-by: Vladislav Oleshko --- src/core/search/range_tree.cc | 2 ++ tests/dragonfly/search_test.py | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/core/search/range_tree.cc b/src/core/search/range_tree.cc index 06f29f01ea88..581a934abd48 100644 --- a/src/core/search/range_tree.cc +++ b/src/core/search/range_tree.cc @@ -91,6 +91,8 @@ void RangeTree::Add(DocId id, double value) { if (enable_splitting_ && block.Size() >= max_range_block_size_ && lb == block.max_seen /* monovalue */ && value != lb /* but new value is different*/ ) { + // We use nextafter as the lower bound to "catch" all other possible inserts into the block, + // as a decreasing `value` sequence would otherwise create lots of single-value blocks double lb2 = std::nextafter(lb, std::numeric_limits::infinity()); CreateEmptyBlock(lb2)->second.Insert({id, value}); return; diff --git a/tests/dragonfly/search_test.py b/tests/dragonfly/search_test.py index 222f4d8ee901..7aeb76a5c87e 100644 --- a/tests/dragonfly/search_test.py +++ b/tests/dragonfly/search_test.py @@ -139,7 +139,17 @@ async def test_management(async_client: aioredis.Redis): assert i1info["num_docs"] == 10 assert sorted(i1info["attributes"]) == [ ["identifier", "f1", "attribute", "f1", "type", "TEXT"], - ["identifier", "f2", "attribute", "f2", "type", "NUMERIC", "SORTABLE", "blocksize", "7000"], + [ + "identifier", + "f2", + "attribute", + "f2", + "type", + "NUMERIC", + "SORTABLE", + "blocksize", + "10000", + ], ] i2info = await i2.info() @@ -163,7 +173,7 @@ async def test_management(async_client: aioredis.Redis): "NOINDEX", "SORTABLE", "blocksize", - "7000", + "10000", ], ["identifier", "f4", "attribute", "f4", "type", "TAG"], ["identifier", "f5", "attribute", "f5", "type", "VECTOR"], From 585dd7540f667ba35108512b1f66919f6a65324e Mon Sep 17 00:00:00 2001 From: Vladislav Oleshko Date: Wed, 3 Dec 2025 13:36:40 +0300 Subject: [PATCH 5/6] Add benchmark Signed-off-by: Vladislav Oleshko --- src/core/search/CMakeLists.txt | 2 +- src/core/search/range_tree_test.cc | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/core/search/CMakeLists.txt b/src/core/search/CMakeLists.txt index 1c05dbb1b476..89bdaad539db 100644 --- a/src/core/search/CMakeLists.txt +++ b/src/core/search/CMakeLists.txt @@ -23,7 +23,7 @@ endif() cxx_test(compressed_sorted_set_test dfly_search_core LABELS DFLY) cxx_test(block_list_test dfly_search_core LABELS DFLY) -cxx_test(range_tree_test dfly_search_core LABELS DFLY) +cxx_test(range_tree_test dfly_search_core absl::random_random LABELS DFLY) cxx_test(rax_tree_test redis_test_lib LABELS DFLY) cxx_test(search_parser_test dfly_search_core LABELS DFLY) cxx_test(search_test redis_test_lib dfly_search_core LABELS DFLY) diff --git a/src/core/search/range_tree_test.cc b/src/core/search/range_tree_test.cc index d4ee2659e561..43ed91c50a2d 100644 --- a/src/core/search/range_tree_test.cc +++ b/src/core/search/range_tree_test.cc @@ -4,6 +4,8 @@ #include "core/search/range_tree.h" +#include +#include #include #include @@ -482,4 +484,20 @@ TEST_F(RangeTreeTest, FinalizeInitialization) { {{{1, 10.0}}, {{2, 20.0}, {3, 20.0}, {5, 20.0}}, {{4, 30.0}, {6, 30.0}}, {{7, 40.0}}})); } +// Benchmark tree insertion performance with set of discrete values +static void BM_DiscreteInsertion(benchmark::State& state) { + RangeTree tree{PMR_NS::get_default_resource()}; + + absl::InsecureBitGen gen{}; + size_t variety = state.range(0); + + DocId id = 0; + for (auto _ : state) { + double v = absl::Uniform(gen, 0u, variety); + tree.Add(id++, v); + } +} + +BENCHMARK(BM_DiscreteInsertion)->Arg(2)->Arg(12)->Arg(128)->Arg(1024); + } // namespace dfly::search From e6c9877835829adb7e84919f4254d581253affc4 Mon Sep 17 00:00:00 2001 From: Vladislav Oleshko Date: Thu, 4 Dec 2025 17:32:31 +0300 Subject: [PATCH 6/6] fixes --- src/core/search/block_list.h | 3 +++ src/core/search/range_tree.cc | 27 +++++++++++++--------- src/core/search/range_tree_test.cc | 37 ++++++++++++++++++++---------- 3 files changed, 44 insertions(+), 23 deletions(-) diff --git a/src/core/search/block_list.h b/src/core/search/block_list.h index cb9f536ffc88..254c6f5bf56e 100644 --- a/src/core/search/block_list.h +++ b/src/core/search/block_list.h @@ -217,8 +217,11 @@ struct SplitResult { Container left; Container right; + + // Median value of split, used as minimum value of right block double median; + // Min/max values of left (lmin, lmax) and right (rmin=median, rmax) blocks double lmin, lmax, rmax; }; } // namespace dfly::search diff --git a/src/core/search/range_tree.cc b/src/core/search/range_tree.cc index 581a934abd48..e894474c8851 100644 --- a/src/core/search/range_tree.cc +++ b/src/core/search/range_tree.cc @@ -78,6 +78,8 @@ RangeTree::RangeTree(PMR_NS::memory_resource* mr, size_t max_range_block_size, : max_range_block_size_(max_range_block_size), entries_(mr), enable_splitting_(enable_splitting) { + // The tree has at least always a block with a negative infinity bound, so that any new insertion + // goes at least somewhere CreateEmptyBlock(-std::numeric_limits::infinity()); } @@ -85,15 +87,16 @@ void RangeTree::Add(DocId id, double value) { DCHECK(std::isfinite(value)); auto it = FindRangeBlock(value); - auto& [lb, block] = *it; + auto& [lower_bound, block] = *it; // Don't disrupt large monovalue blocks, instead create new nextafter block if (enable_splitting_ && block.Size() >= max_range_block_size_ && - lb == block.max_seen /* monovalue */ && value != lb /* but new value is different*/ + lower_bound == block.max_seen /* monovalue */ && + value != lower_bound /* but new value is different*/ ) { // We use nextafter as the lower bound to "catch" all other possible inserts into the block, // as a decreasing `value` sequence would otherwise create lots of single-value blocks - double lb2 = std::nextafter(lb, std::numeric_limits::infinity()); + double lb2 = std::nextafter(lower_bound, std::numeric_limits::infinity()); CreateEmptyBlock(lb2)->second.Insert({id, value}); return; } @@ -105,7 +108,7 @@ void RangeTree::Add(DocId id, double value) { return; // Large monovalue block, not reducable by splitting - if (lb == block.max_seen) + if (lower_bound == block.max_seen) return; SplitBlock(it); @@ -122,7 +125,9 @@ void RangeTree::Remove(DocId id, double value) { // Merge with left block if both are relatively small and won't be forced to split soon if (block.size() < max_range_block_size_ / 4 && it != entries_.begin()) { - auto lit = --it; + auto lit = it; + --lit; + auto& lblock = lit->second; if (block.Size() + lblock.Size() < max_range_block_size_ / 2) { for (auto e : block) @@ -243,7 +248,7 @@ TODO: we can optimize this case by splitting to three blocks: - empty right block with range [std::nextafter(m, +inf), r) */ void RangeTree::SplitBlock(Map::iterator it) { - double l = it->first; + double lower_bound = it->first; auto split_result = Split(std::move(it->second)); @@ -253,12 +258,12 @@ void RangeTree::SplitBlock(Map::iterator it) { entries_.erase(it); stats_.splits++; - // Insert left block if its not empty, but always keep left infinity bound - if (!split_result.left.Empty() || std::isinf(l)) { - if (!std::isinf(l)) - l = split_result.lmin; + // Insert left block if it's not empty or if its the first one (negative inf bound) + if (!split_result.left.Empty() || std::isinf(lower_bound)) { + if (!std::isinf(lower_bound)) // keep negative inf bound + lower_bound = split_result.lmin; - entries_.emplace(std::piecewise_construct, std::forward_as_tuple(l), + entries_.emplace(std::piecewise_construct, std::forward_as_tuple(lower_bound), std::forward_as_tuple(std::move(split_result.left), split_result.lmax)); } diff --git a/src/core/search/range_tree_test.cc b/src/core/search/range_tree_test.cc index 43ed91c50a2d..e3900e777fb3 100644 --- a/src/core/search/range_tree_test.cc +++ b/src/core/search/range_tree_test.cc @@ -326,7 +326,7 @@ TEST_F(RangeTreeTest, SingleBlockSplit) { EXPECT_EQ(stats.splits, 1u); EXPECT_EQ(stats.block_count, 2u); - // Add value that causes one a new block started + // Add value that causes a new block to be started tree.Add(20, 6.0); stats = tree.GetStats(); @@ -345,25 +345,38 @@ TEST_F(RangeTreeTest, SingleBlockSplit) { EXPECT_EQ(blocks[2]->Size(), 1u); } +// Make tree split and then delete every nth value to see if blocks merge properly TEST_F(RangeTreeTest, BlockMerge) { RangeTree tree{PMR_NS::get_default_resource(), 8}; - for (DocId id = 1; id <= 8; id++) + for (DocId id = 1; id <= 64; id++) tree.Add(id, id); - tree.Add(9, 9.0); // will cause split - auto stats = tree.GetStats(); - EXPECT_EQ(stats.splits, 1u); - EXPECT_EQ(stats.block_count, 2u); - - // Delete all except first and last, should trigger merge - for (DocId id = 2; id <= 8; id++) - tree.Remove(id, id); + uint64_t splits = stats.splits; + EXPECT_GT(splits, 8u); + + // Blocks have at least half occupancy + EXPECT_GT(stats.block_count, 64 / 8); + EXPECT_LT(stats.block_count, 2 * 64 / 8); + + // Delete all except %8 = 0, should trigger merge + std::vector expected; + for (DocId id = 1; id <= 64; id++) { + if (id % 8) + tree.Remove(id, id); + else + expected.emplace_back(id, id); + } // Only one block left now stats = tree.GetStats(); - EXPECT_EQ(stats.merges, 1u); - EXPECT_EQ(stats.block_count, 1u); + size_t blocks = stats.block_count; + EXPECT_LT(blocks, 4u); + EXPECT_EQ(stats.merges + blocks - 1, splits); + + // Check the two entries remained + auto result = tree.GetAllBlocks(); + EXPECT_THAT(result, UnorderedElementsAreDocPairs(expected)); } TEST_F(RangeTreeTest, BugNotUniqueDoubleValues) {