Skip to content

Commit e94d2fe

Browse files
authored
Merge pull request #112 from indutny/feature/lookup-and-search
Improve table search speed through lookups
2 parents 243af2c + b4c9ce1 commit e94d2fe

File tree

2 files changed

+387
-21
lines changed

2 files changed

+387
-21
lines changed

scripts/unicode.py

+57-5
Original file line numberDiff line numberDiff line change
@@ -274,13 +274,36 @@ def emit_break_module(f, break_table, break_cats, name):
274274
pub enum %sCat {
275275
""" % (name, Name, Name))
276276

277+
# We don't want the lookup table to be too large so choose a reasonable
278+
# cutoff. 0x20000 is selected because most of the range table entries are
279+
# within the interval of [0x0, 0x20000]
280+
lookup_value_cutoff = 0x20000
281+
282+
# Length of lookup table. It has to be a divisor of `lookup_value_cutoff`.
283+
lookup_table_len = 0x400
284+
285+
lookup_interval = round(lookup_value_cutoff / lookup_table_len)
286+
287+
# Lookup table is a mapping from `character code / lookup_interval` to
288+
# the index in the range table that covers the `character code`.
289+
lookup_table = [0] * lookup_table_len
290+
j = 0
291+
for i in range(0, lookup_table_len):
292+
lookup_from = i * lookup_interval
293+
while j < len(break_table):
294+
(_, entry_to, _) = break_table[j]
295+
if entry_to >= lookup_from:
296+
break
297+
j += 1
298+
lookup_table[i] = j
299+
277300
break_cats.append("Any")
278301
break_cats.sort()
279302
for cat in break_cats:
280303
f.write((" %sC_" % Name[0]) + cat + ",\n")
281304
f.write(""" }
282305
283-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
306+
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
284307
use core::cmp::Ordering::{Equal, Less, Greater};
285308
match r.binary_search_by(|&(lo, hi, _)| {
286309
if lo <= c && c <= hi { Equal }
@@ -293,19 +316,48 @@ def emit_break_module(f, break_table, break_cats, name):
293316
}
294317
Err(idx) => {
295318
(
296-
if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
297-
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX),
319+
if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
320+
r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper),
298321
%sC_Any,
299322
)
300323
}
301324
}
302325
}
303326
304327
pub fn %s_category(c: char) -> (u32, u32, %sCat) {
305-
bsearch_range_value_table(c, %s_cat_table)
328+
// Perform a quick O(1) lookup in a precomputed table to determine
329+
// the slice of the range table to search in.
330+
let lookup_interval = 0x%x;
331+
let idx = (c as u32 / lookup_interval) as usize;
332+
let range = %s_cat_lookup.get(idx..(idx + 2)).map_or(
333+
// If the `idx` is outside of the precomputed table - use the slice
334+
// starting from the last covered index in the precomputed table and
335+
// ending with the length of the range table.
336+
%d..%d,
337+
|r| (r[0] as usize)..((r[1] + 1) as usize)
338+
);
339+
340+
// Compute pessimistic default lower and upper bounds on the category.
341+
// If character doesn't map to any range and there is no adjacent range
342+
// in the table slice - these bounds has to apply.
343+
let lower = idx as u32 * lookup_interval;
344+
let upper = lower + lookup_interval - 1;
345+
bsearch_range_value_table(c, &%s_cat_table[range], lower, upper)
306346
}
307347
308-
""" % (Name, Name, Name[0], name, Name, name))
348+
""" % (Name, Name, Name[0], name, Name, lookup_interval, name, j, len(break_table), name))
349+
350+
351+
if len(break_table) <= 0xff:
352+
lookup_type = "u8"
353+
elif len(break_table) <= 0xffff:
354+
lookup_type = "u16"
355+
else:
356+
lookup_type = "u32"
357+
358+
emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type,
359+
pfun=lambda x: "%d" % x,
360+
is_pub=False, is_const=True)
309361

310362
emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
311363
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),

0 commit comments

Comments
 (0)