
Commit 52e6527

Remove most of the unnecessary UTF-8 logic from the tokenizer.
1 parent b509dc0 commit 52e6527

File tree

1 file changed, +15 −19 lines

src/tokenizer.rs

@@ -359,17 +359,14 @@ impl<'a> Tokenizer<'a> {
     }
 
     #[inline]
-    fn next_char(&self) -> char { self.char_at(0) }
-
-    #[inline]
-    fn char_at(&self, offset: usize) -> char {
-        self.input[self.position + offset..].chars().next().unwrap()
+    fn next_char(&self) -> char {
+        self.input[self.position..].chars().next().unwrap()
     }
 
     #[inline]
     fn has_newline_at(&self, offset: usize) -> bool {
         self.position + offset < self.input.len() &&
-        matches!(self.char_at(offset), '\n' | '\r' | '\x0C')
+        matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
     }
 
     #[inline]
@@ -405,8 +402,8 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
     if tokenizer.is_eof() {
         return Err(())
     }
-    let c = tokenizer.next_byte_unchecked();
-    let token = match_byte! { c,
+    let b = tokenizer.next_byte_unchecked();
+    let token = match_byte! { b,
         b'\t' | b'\n' | b' ' | b'\r' | b'\x0C' => {
             let start_position = tokenizer.position();
             tokenizer.advance(1);
@@ -425,7 +422,7 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
             else if !tokenizer.is_eof() && match tokenizer.next_byte_unchecked() {
                 b'a'...b'z' | b'A'...b'Z' | b'0'...b'9' | b'-' | b'_' => true,
                 b'\\' => !tokenizer.has_newline_at(1),
-                _ => !c.is_ascii(),
+                _ => !b.is_ascii(),
             } { Hash(consume_name(tokenizer)) }
             else { Delim('#') }
         },
@@ -555,12 +552,11 @@ fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
             else { tokenizer.advance(1); Delim('~') }
         },
         _ => {
-            if !c.is_ascii() {
+            if !b.is_ascii() {
                 consume_ident_like(tokenizer)
             } else {
-                let ret = Delim(tokenizer.next_char());
                 tokenizer.advance(1);
-                ret
+                Delim(b as char)
             }
         },
     };
@@ -671,11 +667,11 @@ fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
                 true
             }
             b'\\' => { !tokenizer.has_newline_at(1) }
-            c => { !c.is_ascii() },
+            b => { !b.is_ascii() },
         }
     },
     b'\\' => { !tokenizer.has_newline_at(1) },
-    c => { !c.is_ascii() },
+    b => { !b.is_ascii() },
     }
 }
@@ -759,9 +755,9 @@ fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
     // Do all the math in f64 so that large numbers overflow to +/-inf
     // and i32::{MIN, MAX} are within range.
 
-    let (has_sign, sign) = match tokenizer.next_char() {
-        '-' => (true, -1.),
-        '+' => (true, 1.),
+    let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
+        b'-' => (true, -1.),
+        b'+' => (true, 1.),
         _ => (false, 1.),
     };
     if has_sign {
@@ -780,8 +776,8 @@ fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
     let mut is_integer = true;
 
     let mut fractional_part: f64 = 0.;
-    if tokenizer.has_at_least(1) && tokenizer.next_char() == '.'
-        && matches!(tokenizer.char_at(1), '0'...'9') {
+    if tokenizer.has_at_least(1) && tokenizer.next_byte_unchecked() == b'.'
+        && matches!(tokenizer.byte_at(1), b'0'...b'9') {
         is_integer = false;
         tokenizer.advance(1); // Consume '.'
         let mut factor = 0.1;
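
The change relies on a property of UTF-8 that makes the byte-level checks (`next_byte_unchecked`, `byte_at`, `!b.is_ascii()`) equivalent to the removed char-level ones for ASCII delimiters: every byte of a multi-byte UTF-8 sequence has its high bit set, so a raw-byte comparison against an ASCII literal such as b'.' or b'\n' can never match inside a multi-byte character. A minimal standalone sketch of that invariant (not part of this commit, plain standard-library Rust):

// Standalone sketch, not from this repository: shows why comparing raw
// bytes of a UTF-8 &str against ASCII literals is safe. All bytes of a
// multi-byte UTF-8 sequence are non-ASCII, so `!b.is_ascii()` marks
// exactly the bytes belonging to non-ASCII characters and can never
// overlap with an ASCII delimiter byte.
fn main() {
    let input = "a.é\n"; // bytes: 0x61, 0x2E, 0xC3, 0xA9, 0x0A
    for (i, b) in input.bytes().enumerate() {
        println!("byte {}: {:#04x}, ascii = {}", i, b, b.is_ascii());
    }
    // Delimiters are found by byte comparison alone; both bytes of 'é'
    // are non-ASCII, so neither can be mistaken for '.' or '\n'.
    assert_eq!(input.bytes().position(|b| b == b'.'), Some(1));
    assert_eq!(input.bytes().position(|b| b == b'\n'), Some(4));
}

This is why `char_at` could be dropped and `has_newline_at` and `consume_numeric` can inspect bytes directly, with full char decoding kept only where a non-ASCII character actually needs to be consumed.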
