@@ -274,13 +274,36 @@ def emit_break_module(f, break_table, break_cats, name):
274
274
pub enum %sCat {
275
275
""" % (name , Name , Name ))
276
276
277
+ # We don't want the lookup table to be too large so choose a reasonable
278
+ # cutoff. 0x20000 is selected because most of the range table entries are
279
+ # within the interval of [0x0, 0x20000]
280
+ lookup_value_cutoff = 0x20000
281
+
282
+ # Length of lookup table. It has to be a divisor of `lookup_value_cutoff`.
283
+ lookup_table_len = 0x400
284
+
285
+ lookup_interval = round (lookup_value_cutoff / lookup_table_len )
286
+
287
+ # Lookup table is a mapping from `character code / lookup_interval` to
288
+ # the index in the range table that covers the `character code`.
289
+ lookup_table = [0 ] * lookup_table_len
290
+ j = 0
291
+ for i in range (0 , lookup_table_len ):
292
+ lookup_from = i * lookup_interval
293
+ while j < len (break_table ):
294
+ (_ , entry_to , _ ) = break_table [j ]
295
+ if entry_to >= lookup_from :
296
+ break
297
+ j += 1
298
+ lookup_table [i ] = j
299
+
277
300
break_cats .append ("Any" )
278
301
break_cats .sort ()
279
302
for cat in break_cats :
280
303
f .write ((" %sC_" % Name [0 ]) + cat + ",\n " )
281
304
f .write (""" }
282
305
283
- fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)]) -> (u32, u32, %sCat) {
306
+ fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32 ) -> (u32, u32, %sCat) {
284
307
use core::cmp::Ordering::{Equal, Less, Greater};
285
308
match r.binary_search_by(|&(lo, hi, _)| {
286
309
if lo <= c && c <= hi { Equal }
@@ -293,19 +316,48 @@ def emit_break_module(f, break_table, break_cats, name):
293
316
}
294
317
Err(idx) => {
295
318
(
296
- if idx > 0 { r[idx-1].1 as u32 + 1 } else { 0 },
297
- r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(core::u32::MAX ),
319
+ if idx > 0 { r[idx-1].1 as u32 + 1 } else { default_lower },
320
+ r.get(idx).map(|c|c.0 as u32 - 1).unwrap_or(default_upper ),
298
321
%sC_Any,
299
322
)
300
323
}
301
324
}
302
325
}
303
326
304
327
pub fn %s_category(c: char) -> (u32, u32, %sCat) {
305
- bsearch_range_value_table(c, %s_cat_table)
328
+ // Perform a quick O(1) lookup in a precomputed table to determine
329
+ // the slice of the range table to search in.
330
+ let lookup_interval = 0x%x;
331
+ let idx = (c as u32 / lookup_interval) as usize;
332
+ let range = %s_cat_lookup.get(idx..(idx + 2)).map_or(
333
+ // If the `idx` is outside of the precomputed table - use the slice
334
+ // starting from the last covered index in the precomputed table and
335
+ // ending with the length of the range table.
336
+ %d..%d,
337
+ |r| (r[0] as usize)..((r[1] + 1) as usize)
338
+ );
339
+
340
+ // Compute pessimistic default lower and upper bounds on the category.
341
+ // If character doesn't map to any range and there is no adjacent range
342
+ // in the table slice - these bounds has to apply.
343
+ let lower = idx as u32 * lookup_interval;
344
+ let upper = lower + lookup_interval - 1;
345
+ bsearch_range_value_table(c, &%s_cat_table[range], lower, upper)
306
346
}
307
347
308
- """ % (Name , Name , Name [0 ], name , Name , name ))
348
+ """ % (Name , Name , Name [0 ], name , Name , lookup_interval , name , j , len (break_table ), name ))
349
+
350
+
351
+ if len (break_table ) <= 0xff :
352
+ lookup_type = "u8"
353
+ elif len (break_table ) <= 0xffff :
354
+ lookup_type = "u16"
355
+ else :
356
+ lookup_type = "u32"
357
+
358
+ emit_table (f , "%s_cat_lookup" % name , lookup_table , "&'static [%s]" % lookup_type ,
359
+ pfun = lambda x : "%d" % x ,
360
+ is_pub = False , is_const = True )
309
361
310
362
emit_table (f , "%s_cat_table" % name , break_table , "&'static [(char, char, %sCat)]" % Name ,
311
363
pfun = lambda x : "(%s,%s,%sC_%s)" % (escape_char (x [0 ]), escape_char (x [1 ]), Name [0 ], x [2 ]),
0 commit comments