Commit 0ad848c

eyalsatori authored and eyalleshem committed

Prepare tokenizer for using borrowed strings instead of allocations.

Key points for this commit:
- The Peekable trait isn't sufficient for working with string slices, since we need the byte indexes (start/end) to create them, so the current byte position was added to the State struct. (Note: in the long term we could potentially remove Peekable and use only the current position as an iterator.)
- Internal functions were created that build slices from the original query instead of allocating strings; these functions are then converted to return String to maintain compatibility. (The idea is to keep the commit small and reviewable, without changing the Token struct or the parser.)

1 parent 308a723 · commit 0ad848c
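As a minimal sketch of the direction this commit prepares (illustrative only; the take_word helper and its signature are not part of this commit): track a byte offset while consuming characters, then return a slice of the original query instead of allocating a new String.

fn take_word(source: &str, start: usize) -> &str {
    // Advance a byte offset instead of pushing chars into a String.
    let mut end = start;
    for ch in source[start..].chars() {
        if !ch.is_alphanumeric() {
            break;
        }
        end += ch.len_utf8(); // byte length, not char count
    }
    // Zero-copy: the result borrows from `source` for as long as it lives.
    &source[start..end]
}

fn main() {
    let query = "SELECT café FROM t";
    assert_eq!(take_word(query, 7), "café"); // 'é' is 2 bytes in UTF-8
}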

2 files changed: +130 −47 lines

src/ast/mod.rs

Lines changed: 2 additions & 7 deletions
@@ -2787,10 +2787,11 @@ impl fmt::Display for Declare {
 }

 /// Sql options of a `CREATE TABLE` statement.
-#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
+#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash, Default)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
 pub enum CreateTableOptions {
+    #[default]
     None,
     /// Options specified using the `WITH` keyword.
     /// e.g. `WITH (description = "123")`
@@ -2819,12 +2820,6 @@ pub enum CreateTableOptions {
     TableProperties(Vec<SqlOption>),
 }

-impl Default for CreateTableOptions {
-    fn default() -> Self {
-        Self::None
-    }
-}
-
 impl fmt::Display for CreateTableOptions {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match self {
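For context on the src/ast/mod.rs change above: deriving Default on an enum together with the #[default] variant attribute (stable since Rust 1.62) is equivalent to the hand-written impl being removed. A trimmed-down sketch, with the variants simplified for illustration:

#[derive(Debug, Default, PartialEq)]
enum CreateTableOptions {
    // The variant returned by CreateTableOptions::default().
    #[default]
    None,
    With(Vec<String>),
}

fn main() {
    assert_eq!(CreateTableOptions::default(), CreateTableOptions::None);
}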

src/tokenizer.rs

Lines changed: 128 additions & 40 deletions
@@ -743,8 +743,12 @@ impl std::error::Error for TokenizerError {}

 struct State<'a> {
     peekable: Peekable<Chars<'a>>,
-    pub line: u64,
-    pub col: u64,
+    /// Reference to the original source string being tokenized
+    source: &'a str,
+    line: u64,
+    col: u64,
+    /// Byte position in the source string
+    byte_pos: usize,
 }

 impl State<'_> {
@@ -759,6 +763,8 @@ impl State<'_> {
         } else {
             self.col += 1;
         }
+        // Update byte position (characters can be multi-byte in UTF-8)
+        self.byte_pos += s.len_utf8();
         Some(s)
     }
 }
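Tracking the position in bytes rather than characters is what makes the later slicing safe, because &str is indexed by byte offsets and slicing off a character boundary panics. A quick standalone illustration (not part of the commit):

fn main() {
    // UTF-8 characters vary in width, so byte_pos must advance by len_utf8().
    assert_eq!('a'.len_utf8(), 1);
    assert_eq!('é'.len_utf8(), 2);
    assert_eq!('中'.len_utf8(), 3);
    // Slicing is by bytes: the first character of "中文" spans bytes 0..3.
    let s = "中文";
    assert_eq!(&s[0..3], "中");
}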
@@ -769,6 +775,16 @@ impl State<'_> {
         self.peekable.peek()
     }

+    /// Return the character `n` positions ahead without advancing the stream.
+    /// For example, `peek_nth(0)` returns the current character (same as peek),
+    /// and `peek_nth(1)` returns the next character.
+    pub fn peek_nth(&self, n: usize) -> Option<char> {
+        if self.byte_pos >= self.source.len() {
+            return None;
+        }
+        self.source[self.byte_pos..].chars().nth(n)
+    }
+
     pub fn location(&self) -> Location {
         Location {
             line: self.line,
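Because peek_nth re-slices the source at byte_pos, the tokenizer no longer needs to clone the whole Peekable iterator for one character of lookahead (see the peeking_next_take_while change further down). A standalone sketch of the same semantics (the free function and its names are illustrative):

fn peek_nth(source: &str, byte_pos: usize, n: usize) -> Option<char> {
    // Look ahead without consuming: slice at the cursor, then step n chars.
    if byte_pos >= source.len() {
        return None;
    }
    source[byte_pos..].chars().nth(n)
}

fn main() {
    let src = "abc";
    assert_eq!(peek_nth(src, 0, 0), Some('a')); // same as peek()
    assert_eq!(peek_nth(src, 0, 1), Some('b')); // one char of lookahead
    assert_eq!(peek_nth(src, 0, 9), None);      // past end of input
}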
@@ -893,8 +909,10 @@ impl<'a> Tokenizer<'a> {
     ) -> Result<(), TokenizerError> {
         let mut state = State {
             peekable: self.query.chars().peekable(),
+            source: self.query,
             line: 1,
             col: 1,
+            byte_pos: 0,
         };

         let mut location = state.location();
@@ -908,22 +926,24 @@ impl<'a> Tokenizer<'a> {
         Ok(())
     }

-    // Tokenize the identifier or keywords in `ch`
+    /// Tokenize an identifier or keyword after consuming the first character(s).
+    /// `consumed_byte_len` is the total byte length of the character(s) already consumed.
     fn tokenize_identifier_or_keyword(
         &self,
-        ch: impl IntoIterator<Item = char>,
-        chars: &mut State,
+        consumed_byte_len: usize,
+        chars: &mut State<'a>,
     ) -> Result<Option<Token>, TokenizerError> {
         chars.next(); // consume the first char
-        let ch: String = ch.into_iter().collect();
-        let word = self.tokenize_word(ch, chars);
+        let word = self.tokenize_word(consumed_byte_len, chars);

         // TODO: implement parsing of exponent here
         if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
             let mut inner_state = State {
                 peekable: word.chars().peekable(),
+                source: &word,
                 line: 0,
                 col: 0,
+                byte_pos: 0,
             };
             let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
             let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
@@ -937,7 +957,7 @@ impl<'a> Tokenizer<'a> {
     /// Get the next token or return None
     fn next_token(
         &self,
-        chars: &mut State,
+        chars: &mut State<'a>,
         prev_token: Option<&Token>,
     ) -> Result<Option<Token>, TokenizerError> {
         match chars.peek() {
@@ -988,7 +1008,7 @@ impl<'a> Tokenizer<'a> {
                     }
                     _ => {
                         // regular identifier starting with an "b" or "B"
-                        let s = self.tokenize_word(b, chars);
+                        let s = self.tokenize_word(b.len_utf8(), chars);
                         Ok(Some(Token::make_word(&s, None)))
                     }
                 }
@@ -1015,7 +1035,7 @@ impl<'a> Tokenizer<'a> {
                     ),
                     _ => {
                         // regular identifier starting with an "r" or "R"
-                        let s = self.tokenize_word(b, chars);
+                        let s = self.tokenize_word(b.len_utf8(), chars);
                         Ok(Some(Token::make_word(&s, None)))
                     }
                 }
@@ -1034,7 +1054,7 @@ impl<'a> Tokenizer<'a> {
                     }
                     _ => {
                         // regular identifier starting with an "N"
-                        let s = self.tokenize_word(n, chars);
+                        let s = self.tokenize_word(n.len_utf8(), chars);
                         Ok(Some(Token::make_word(&s, None)))
                     }
                 }
@@ -1051,7 +1071,7 @@ impl<'a> Tokenizer<'a> {
                     }
                     _ => {
                         // regular identifier starting with an "E" or "e"
-                        let s = self.tokenize_word(x, chars);
+                        let s = self.tokenize_word(x.len_utf8(), chars);
                         Ok(Some(Token::make_word(&s, None)))
                     }
                 }
@@ -1070,7 +1090,7 @@ impl<'a> Tokenizer<'a> {
                     }
                 }
                 // regular identifier starting with an "U" or "u"
-                let s = self.tokenize_word(x, chars);
+                let s = self.tokenize_word(x.len_utf8(), chars);
                 Ok(Some(Token::make_word(&s, None)))
             }
             // The spec only allows an uppercase 'X' to introduce a hex
@@ -1085,7 +1105,7 @@ impl<'a> Tokenizer<'a> {
                     }
                     _ => {
                         // regular identifier starting with an "X"
-                        let s = self.tokenize_word(x, chars);
+                        let s = self.tokenize_word(x.len_utf8(), chars);
                         Ok(Some(Token::make_word(&s, None)))
                     }
                 }
@@ -1382,7 +1402,8 @@ impl<'a> Tokenizer<'a> {
                 match chars.peek() {
                     Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
                     Some(sch) if self.dialect.is_identifier_start('%') => {
-                        self.tokenize_identifier_or_keyword([ch, *sch], chars)
+                        let consumed_byte_len = ch.len_utf8() + sch.len_utf8();
+                        self.tokenize_identifier_or_keyword(consumed_byte_len, chars)
                     }
                     _ => self.start_binop(chars, "%", Token::Mod),
                 }
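Note that consumed_byte_len is a byte count, not a character count, so the arithmetic stays correct when an identifier begins with a multi-byte character. An illustrative check (not from the commit):

fn main() {
    // '%' is 1 byte; a non-ASCII identifier head such as 'ü' is 2 bytes.
    let (ch, sch) = ('%', 'ü');
    let consumed_byte_len = ch.len_utf8() + sch.len_utf8();
    assert_eq!(consumed_byte_len, 3); // word start = byte_pos - 3
}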
@@ -1610,7 +1631,8 @@ impl<'a> Tokenizer<'a> {
                         self.consume_for_binop(chars, "##", Token::DoubleSharp)
                     }
                     Some(sch) if self.dialect.is_identifier_start('#') => {
-                        self.tokenize_identifier_or_keyword([ch, *sch], chars)
+                        let consumed_byte_len = ch.len_utf8() + sch.len_utf8();
+                        self.tokenize_identifier_or_keyword(consumed_byte_len, chars)
                     }
                     _ => self.start_binop(chars, "#", Token::Sharp),
                 }
@@ -1635,7 +1657,9 @@ impl<'a> Tokenizer<'a> {
                         match chars.peek() {
                             Some(' ') => Ok(Some(Token::AtAt)),
                             Some(tch) if self.dialect.is_identifier_start('@') => {
-                                self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
+                                let consumed_byte_len =
+                                    ch.len_utf8() + '@'.len_utf8() + tch.len_utf8();
+                                self.tokenize_identifier_or_keyword(consumed_byte_len, chars)
                             }
                             _ => Ok(Some(Token::AtAt)),
                         }
@@ -1654,7 +1678,8 @@ impl<'a> Tokenizer<'a> {
                     Some('\"') => Ok(Some(Token::AtSign)),
                     Some('`') => Ok(Some(Token::AtSign)),
                     Some(sch) if self.dialect.is_identifier_start('@') => {
-                        self.tokenize_identifier_or_keyword([ch, *sch], chars)
+                        let consumed_byte_len = ch.len_utf8() + sch.len_utf8();
+                        self.tokenize_identifier_or_keyword(consumed_byte_len, chars)
                     }
                     _ => Ok(Some(Token::AtSign)),
                 }
@@ -1695,7 +1720,8 @@ impl<'a> Tokenizer<'a> {

             // identifier or keyword
             ch if self.dialect.is_identifier_start(ch) => {
-                self.tokenize_identifier_or_keyword([ch], chars)
+                let consumed_byte_len = ch.len_utf8();
+                self.tokenize_identifier_or_keyword(consumed_byte_len, chars)
             }
             '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),

@@ -1876,13 +1902,36 @@ impl<'a> Tokenizer<'a> {
         comment
     }

-    /// Tokenize an identifier or keyword, after the first char is already consumed.
-    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
-        let mut s = first_chars.into();
-        s.push_str(&peeking_take_while(chars, |ch| {
-            self.dialect.is_identifier_part(ch)
-        }));
-        s
+    /// Tokenize an identifier or keyword, after the first char(s) have already been consumed.
+    /// `consumed_byte_len` is the byte length of the consumed character(s).
+    fn tokenize_word(&self, consumed_byte_len: usize, chars: &mut State<'a>) -> String {
+        // Overflow check: ensure we can safely subtract
+        if consumed_byte_len > chars.byte_pos {
+            return String::new();
+        }
+
+        // Calculate where the first character started
+        let first_char_byte_pos = chars.byte_pos - consumed_byte_len;
+
+        // Use the zero-copy version and convert to String
+        self.tokenize_word_borrowed(first_char_byte_pos, chars)
+            .to_string()
+    }
+
+    /// Tokenize an identifier or keyword, returning a borrowed slice when possible.
+    /// The first character position must be provided (before it was consumed).
+    /// Returns a slice with the same lifetime as the State's source.
+    fn tokenize_word_borrowed(&self, first_char_byte_pos: usize, chars: &mut State<'a>) -> &'a str {
+        // Consume the rest of the word
+        peeking_take_while_ref(chars, |ch| self.dialect.is_identifier_part(ch));
+
+        // Boundary check: ensure first_char_byte_pos is valid
+        if first_char_byte_pos > chars.byte_pos || first_char_byte_pos > chars.source.len() {
+            return "";
+        }
+
+        // Return a slice from the first char to the current position
+        &chars.source[first_char_byte_pos..chars.byte_pos]
     }

     /// Read a quoted identifier
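The byte arithmetic in tokenize_word can be traced on a small input. A self-contained sketch under the same assumptions (the word_at helper is hypothetical, not the tokenizer's API):

fn word_at(source: &str, byte_pos: usize, consumed_byte_len: usize) -> &str {
    // byte_pos is the cursor *after* the word's first char(s) were consumed;
    // stepping back consumed_byte_len bytes recovers where the word started.
    let start = byte_pos - consumed_byte_len;
    let mut end = byte_pos;
    for ch in source[byte_pos..].chars() {
        if !ch.is_alphanumeric() {
            break;
        }
        end += ch.len_utf8();
    }
    &source[start..end] // borrowed from `source`, no allocation
}

fn main() {
    let query = "SELECT name FROM t";
    // Suppose the 1-byte char 'n' at offset 7 was just consumed: cursor is 8.
    assert_eq!(word_at(query, 8, 1), "name");
}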
@@ -2176,35 +2225,72 @@ impl<'a> Tokenizer<'a> {
 /// Read from `chars` until `predicate` returns `false` or EOF is hit.
 /// Return the characters read as String, and keep the first non-matching
 /// char available as `chars.next()`.
-fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
-    let mut s = String::new();
+fn peeking_take_while(chars: &mut State, predicate: impl FnMut(char) -> bool) -> String {
+    peeking_take_while_ref(chars, predicate).to_string()
+}
+
+/// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
+/// Returns a borrowed slice of the source string containing the matched characters.
+/// This is the zero-copy version of `peeking_take_while`.
+fn peeking_take_while_ref<'a>(
+    chars: &mut State<'a>,
+    mut predicate: impl FnMut(char) -> bool,
+) -> &'a str {
+    // Record the starting byte position
+    let start_pos = chars.byte_pos;
+
+    // Consume characters while predicate is true
     while let Some(&ch) = chars.peek() {
         if predicate(ch) {
-            chars.next(); // consume
-            s.push(ch);
+            chars.next(); // consume (this updates byte_pos)
         } else {
             break;
         }
     }
-    s
+
+    // Get the ending byte position
+    let end_pos = chars.byte_pos;
+
+    // Return the slice from the original source
+    &chars.source[start_pos..end_pos]
 }

-/// Same as peeking_take_while, but also passes the next character to the predicate.
-fn peeking_next_take_while(
-    chars: &mut State,
+/// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
+/// This version also passes the next character to the predicate for lookahead, taking
+/// both the current char and optional next char. Returns a borrowed slice of the source
+/// string containing the matched characters.
+///
+/// This is a zero-copy version of `peeking_next_take_while`.
+fn peeking_take_while_next_ref<'a>(
+    chars: &mut State<'a>,
     mut predicate: impl FnMut(char, Option<char>) -> bool,
-) -> String {
-    let mut s = String::new();
+) -> &'a str {
+    // Record the starting byte position
+    let start_pos = chars.byte_pos;
+
+    // Consume characters while predicate is true
     while let Some(&ch) = chars.peek() {
-        let next_char = chars.peekable.clone().nth(1);
+        let next_char = chars.peek_nth(1);
         if predicate(ch, next_char) {
-            chars.next(); // consume
-            s.push(ch);
+            chars.next(); // consume (this updates byte_pos)
         } else {
             break;
         }
     }
-    s
+
+    // Get the ending byte position
+    let end_pos = chars.byte_pos;
+
+    // Return the slice from the original source
+    &chars.source[start_pos..end_pos]
+}
+
+/// Same as peeking_take_while, but also passes the next character to the predicate.
+fn peeking_next_take_while(
+    chars: &mut State,
+    predicate: impl FnMut(char, Option<char>) -> bool,
+) -> String {
+    peeking_take_while_next_ref(chars, predicate).to_string()
 }

 fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
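The zero-copy take-while pattern above generalizes: any cursor that keeps a byte position in sync with its char iterator can hand out borrowed slices. A self-contained sketch (the Cursor type is illustrative, standing in for State):

struct Cursor<'a> {
    source: &'a str,
    byte_pos: usize,
}

impl<'a> Cursor<'a> {
    fn take_while_ref(&mut self, mut pred: impl FnMut(char) -> bool) -> &'a str {
        let start = self.byte_pos;
        for ch in self.source[self.byte_pos..].chars() {
            if !pred(ch) {
                break;
            }
            self.byte_pos += ch.len_utf8(); // keep the byte cursor in sync
        }
        // Matched characters as a slice of the original source.
        &self.source[start..self.byte_pos]
    }
}

fn main() {
    let mut c = Cursor { source: "abc123", byte_pos: 0 };
    assert_eq!(c.take_while_ref(|ch| ch.is_alphabetic()), "abc");
    assert_eq!(c.take_while_ref(|ch| ch.is_ascii_digit()), "123");
}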
@@ -3496,8 +3582,10 @@ mod tests {
         let s = format!("'{s}'");
         let mut state = State {
             peekable: s.chars().peekable(),
+            source: &s,
             line: 0,
             col: 0,
+            byte_pos: 0,
         };

         assert_eq!(
