Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions src/core/search/block_list.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,18 @@ SplitResult Split(BlockList<SortedVector<std::pair<DocId, double>>>&& block_list
left.ReserveBlocks(block_list.blocks_.size() / 2 + 1);
right.ReserveBlocks(block_list.blocks_.size() / 2 + 1);

double min_value_in_right_part = std::numeric_limits<double>::infinity();
double lmin = std::numeric_limits<double>::infinity(), rmin = lmin;
double lmax = -std::numeric_limits<double>::infinity(), rmax = lmax;

for (const Entry& entry : block_list) {
if (entry.second < median_value) {
left.PushBack(entry);
lmin = std::min(lmin, entry.second);
lmax = std::max(lmax, entry.second);
} else if (entry.second > median_value) {
right.PushBack(entry);
if (entry.second < min_value_in_right_part) {
min_value_in_right_part = entry.second;
}
rmin = std::min(rmin, entry.second);
rmax = std::max(rmax, entry.second);
} else {
median_entries.push_back(entry);
}
Expand All @@ -57,20 +60,23 @@ SplitResult Split(BlockList<SortedVector<std::pair<DocId, double>>>&& block_list

if (left.Size() < right.Size()) {
// If left is smaller, we can add median entries to it
// We need to change median value to the right part
median_value = min_value_in_right_part;
// We need to change median value to the right part and update lmax
lmax = median_value;
lmin = std::min(lmin, median_value);
median_value = rmin;
for (const auto& entry : median_entries) {
left.Insert(entry);
}
} else {
// If right part is smaller, we can add median entries to it
// Median value is still the same
rmax = std::max(rmax, median_value);
for (const auto& entry : median_entries) {
right.Insert(entry);
}
}

return {std::move(left), std::move(right), median_value};
return {std::move(left), std::move(right), median_value, lmin, lmax, rmax};
}

template <typename C> bool BlockList<C>::Insert(ElementType t) {
Expand Down
2 changes: 2 additions & 0 deletions src/core/search/block_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -218,5 +218,7 @@ struct SplitResult {
Container left;
Container right;
double median;

double lmin, lmax, rmax;
};
} // namespace dfly::search
79 changes: 54 additions & 25 deletions src/core/search/range_tree.cc
Original file line number Diff line number Diff line change
Expand Up @@ -78,26 +78,37 @@ RangeTree::RangeTree(PMR_NS::memory_resource* mr, size_t max_range_block_size,
: max_range_block_size_(max_range_block_size),
entries_(mr),
enable_splitting_(enable_splitting) {
entries_.emplace(
std::piecewise_construct,
std::forward_as_tuple(-std::numeric_limits<RangeNumber>::infinity()),
std::forward_as_tuple(entries_.get_allocator().resource(), max_range_block_size_));
CreateEmptyBlock(-std::numeric_limits<double>::infinity());
}

void RangeTree::Add(DocId id, double value) {
DCHECK(std::isfinite(value));

auto it = FindRangeBlock(value);
RangeBlock& block = it->second;
auto& [lb, block] = *it;

// Don't disrupt large monovalue blocks, instead create new nextafter block
if (enable_splitting_ && block.Size() >= max_range_block_size_ &&
lb == block.max_seen /* monovalue */ && value != lb /* but new value is different*/
) {
// We use nextafter as the lower bound to "catch" all other possible inserts into the block,
// as a decreasing `value` sequence would otherwise create lots of single-value blocks
double lb2 = std::nextafter(lb, std::numeric_limits<double>::infinity());
CreateEmptyBlock(lb2)->second.Insert({id, value});
return;
}

auto insert_result = block.Insert({id, value});
LOG_IF(ERROR, !insert_result) << "RangeTree: Failed to insert id: " << id << ", value: " << value;

if (!enable_splitting_ || block.Size() <= max_range_block_size_) {
if (!enable_splitting_ || block.Size() <= max_range_block_size_)
return;
}

SplitBlock(std::move(it));
// Large monovalue block, not reducable by splitting
if (lb == block.max_seen)
return;

SplitBlock(it);
}

void RangeTree::Remove(DocId id, double value) {
Expand All @@ -109,10 +120,17 @@ void RangeTree::Remove(DocId id, double value) {
auto remove_result = block.Remove({id, value});
LOG_IF(ERROR, !remove_result) << "RangeTree: Failed to remove id: " << id << ", value: " << value;

// TODO: maybe merging blocks if they are too small
// The problem that for each mutable operation we do Remove and then Add,
// So we can do merge and split for one operation.
// Or in common cases users do not remove a lot of documents?
// Merge with left block if both are relatively small and won't be forced to split soon
if (block.size() < max_range_block_size_ / 4 && it != entries_.begin()) {
auto lit = --it;
auto& lblock = lit->second;
if (block.Size() + lblock.Size() < max_range_block_size_ / 2) {
for (auto e : block)
lblock.Insert(e);
entries_.erase(it);
stats_.merges++;
}
}
}

RangeResult RangeTree::Range(double l, double r) const {
Expand Down Expand Up @@ -175,10 +193,7 @@ void RangeTree::FinalizeInitialization() {
for (size_t b = 0; b < entries.size(); b += max_range_block_size_) {
RangeBlock* range_block;
if (b) {
auto it = entries_.emplace(
std::piecewise_construct, std::forward_as_tuple(entries[b].second),
std::forward_as_tuple(entries_.get_allocator().resource(), max_range_block_size_));
range_block = &it.first->second;
range_block = &CreateEmptyBlock(entries[b].second)->second;
} else {
range_block = &block;
}
Expand All @@ -200,6 +215,13 @@ RangeTree::Map::const_iterator RangeTree::FindRangeBlock(double value) const {
return FindRangeBlockImpl(entries_, value);
}

RangeTree::Map::iterator RangeTree::CreateEmptyBlock(double lb) {
return entries_
.emplace(std::piecewise_construct, std::forward_as_tuple(lb),
std::forward_as_tuple(entries_.get_allocator().resource(), max_range_block_size_))
.first;
}

/*
There is an edge case in the SplitBlock method:
If split_result.left.Size() == 0, it means that all values in the block
Expand All @@ -221,37 +243,44 @@ TODO: we can optimize this case by splitting to three blocks:
- empty right block with range [std::nextafter(m, +inf), r)
*/
void RangeTree::SplitBlock(Map::iterator it) {
const RangeNumber l = it->first;
double l = it->first;

auto split_result = Split(std::move(it->second));

const RangeNumber m = split_result.median;
const double m = split_result.median;
DCHECK(!split_result.right.Empty());

entries_.erase(it);
stats_.splits++;

// Insert left block if its not empty, but always keep left infinity bound
if (!split_result.left.Empty() || std::isinf(l)) {
if (!std::isinf(l))
l = split_result.lmin;

if (l != m) {
// If l == m, it means that all values in the block were equal to the median value
// We can not insert an empty block with range [l, l) because it is not valid.
entries_.emplace(std::piecewise_construct, std::forward_as_tuple(l),
std::forward_as_tuple(std::move(split_result.left)));
std::forward_as_tuple(std::move(split_result.left), split_result.lmax));
}

entries_.emplace(std::piecewise_construct, std::forward_as_tuple(m),
std::forward_as_tuple(std::move(split_result.right)));
std::forward_as_tuple(std::move(split_result.right), split_result.rmax));

DCHECK(TreeIsInCorrectState());
}

RangeTree::Stats RangeTree::GetStats() const {
return Stats{.splits = stats_.splits, .merges = stats_.merges, .block_count = entries_.size()};
}

// Used for DCHECKs to check that the tree is in a correct state.
[[maybe_unused]] bool RangeTree::TreeIsInCorrectState() const {
if (entries_.empty()) {
return false;
}

Key prev_range = entries_.begin()->first;
double prev_range = entries_.begin()->first;
for (auto it = std::next(entries_.begin()); it != entries_.end(); ++it) {
const Key& current_range = it->first;
const double& current_range = it->first;

// Check that ranges are non-overlapping and sorted
// Also there can not be gaps between ranges
Expand Down
44 changes: 37 additions & 7 deletions src/core/search/range_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,31 @@ class RangeTree {
public:
friend class RangeResult;

using RangeNumber = double;
using Key = RangeNumber;
using Entry = std::pair<DocId, double>;
using RangeBlock = BlockList<SortedVector<Entry>>;
using Map = absl::btree_map<Key, RangeBlock, std::less<Key>,
PMR_NS::polymorphic_allocator<std::pair<const Key, RangeBlock>>>;

static constexpr size_t kDefaultMaxRangeBlockSize = 7000;
static constexpr size_t kBlockSize = 400;
// Main node of numeric tree
struct RangeBlock : public BlockList<SortedVector<Entry>> {
template <typename... Ts>
explicit RangeBlock(PMR_NS::memory_resource* mr, Ts... ts) : BlockList{mr, ts...} {
}

RangeBlock(BlockList<SortedVector<Entry>>&& bs, double maxv)
: BlockList{std::move(bs)}, max_seen{maxv} {
}

bool Insert(Entry e) {
max_seen = std::max(max_seen, e.second);
return BlockList::Insert(e);
}

// Max value seen, might be not present anymore
double max_seen = -std::numeric_limits<double>::infinity();
};

using Map = absl::btree_map<double, RangeBlock, std::less<>,
PMR_NS::polymorphic_allocator<std::pair<double, RangeBlock>>>;

static constexpr size_t kDefaultMaxRangeBlockSize = 10'000;

explicit RangeTree(PMR_NS::memory_resource* mr,
size_t max_range_block_size = kDefaultMaxRangeBlockSize,
Expand All @@ -64,10 +80,19 @@ class RangeTree {

void FinalizeInitialization();

struct Stats {
size_t splits = 0;
size_t merges = 0;
size_t block_count = 0;
};

Stats GetStats() const;

private:
Map::iterator FindRangeBlock(double value);
Map::const_iterator FindRangeBlock(double value) const;

Map::iterator CreateEmptyBlock(double lb);
void SplitBlock(Map::iterator it);

// Used for DCHECKs
Expand All @@ -78,6 +103,11 @@ class RangeTree {
size_t max_range_block_size_;
Map entries_;

struct {
size_t splits = 0;
size_t merges = 0;
} stats_;

/* During index initialization, we are using temporary buffer to store all entries.
That is needed to avoid unnecessary complexity of splitting blocks.
After calling FinishInitialization, the tmp_buffer_ is cleared and
Expand Down
60 changes: 57 additions & 3 deletions src/core/search/range_tree_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,12 @@ TEST_F(RangeTreeTest, Add) {
tree.Add(3, 20.0);
tree.Add(4, 30.0);
tree.Add(5, 30.0);
tree.Add(6, 30.0);

auto result = tree.RangeBlocks(10.0, 30.0);
EXPECT_THAT(result, UnorderedElementsAreDocPairs(
{{1, 10.0}, {1, 20.0}, {2, 20.0}, {3, 20.0}, {4, 30.0}, {5, 30.0}}));
EXPECT_THAT(result,
UnorderedElementsAreDocPairs(
{{1, 10.0}, {1, 20.0}, {2, 20.0}, {3, 20.0}, {4, 30.0}, {5, 30.0}, {6, 30.0}}));

// Test that the ranges was split correctly
result = tree.RangeBlocks(kMinRangeValue, 19.0);
Expand All @@ -127,7 +129,7 @@ TEST_F(RangeTreeTest, Add) {
EXPECT_THAT(result, UnorderedElementsAreDocPairs({{1, 20.0}, {2, 20.0}, {3, 20.0}}));

result = tree.RangeBlocks(30.0, kMaxRangeValue);
EXPECT_THAT(result, UnorderedElementsAreDocPairs({{4, 30.0}, {5, 30.0}}));
EXPECT_THAT(result, UnorderedElementsAreDocPairs({{4, 30.0}, {5, 30.0}, {6, 30.0}}));
}

TEST_F(RangeTreeTest, RemoveSimple) {
Expand Down Expand Up @@ -310,6 +312,58 @@ TEST_F(RangeTreeTest, Range) {
}
}

// Don't split single block with same value
TEST_F(RangeTreeTest, SingleBlockSplit) {
RangeTree tree{PMR_NS::get_default_resource(), 4};

for (DocId id = 1; id <= 16; id++)
tree.Add(id, 5.0);

// One split was made to create an empty leftmost block
auto stats = tree.GetStats();
EXPECT_EQ(stats.splits, 1u);
EXPECT_EQ(stats.block_count, 2u);

// Add value that causes one a new block started
tree.Add(20, 6.0);

stats = tree.GetStats();
EXPECT_EQ(stats.splits, 1u); // detected ahead, so no split
EXPECT_EQ(stats.block_count, 3u); // but new block

// No more splits with same 5.0
tree.Add(17, 5.0);
stats = tree.GetStats();
EXPECT_EQ(stats.splits, 1u);

// Verify block sizes
auto blocks = tree.GetAllBlocks();
EXPECT_EQ(blocks[0]->Size(), 0u);
EXPECT_EQ(blocks[1]->Size(), 17u);
EXPECT_EQ(blocks[2]->Size(), 1u);
}

TEST_F(RangeTreeTest, BlockMerge) {
RangeTree tree{PMR_NS::get_default_resource(), 8};
for (DocId id = 1; id <= 8; id++)
tree.Add(id, id);

tree.Add(9, 9.0); // will cause split

auto stats = tree.GetStats();
EXPECT_EQ(stats.splits, 1u);
EXPECT_EQ(stats.block_count, 2u);

// Delete all except first and last, should trigger merge
for (DocId id = 2; id <= 8; id++)
tree.Remove(id, id);

// Only one block left now
stats = tree.GetStats();
EXPECT_EQ(stats.merges, 1u);
EXPECT_EQ(stats.block_count, 1u);
}

TEST_F(RangeTreeTest, BugNotUniqueDoubleValues) {
// TODO: fix the bug
GTEST_SKIP() << "Bug not fixed yet";
Expand Down
12 changes: 11 additions & 1 deletion tests/dragonfly/search_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,17 @@ async def test_management(async_client: aioredis.Redis):
assert i1info["num_docs"] == 10
assert sorted(i1info["attributes"]) == [
["identifier", "f1", "attribute", "f1", "type", "TEXT"],
["identifier", "f2", "attribute", "f2", "type", "NUMERIC", "SORTABLE", "blocksize", "7000"],
[
"identifier",
"f2",
"attribute",
"f2",
"type",
"NUMERIC",
"SORTABLE",
"blocksize",
"10000",
],
]

i2info = await i2.info()
Expand Down
Loading