@@ -743,8 +743,12 @@ impl std::error::Error for TokenizerError {}
743743
/// Tokenizer cursor state: the character stream plus both human-readable
/// (line/column) and byte-level positions into the input.
struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    /// The original source string being tokenized; `byte_pos` indexes into it.
    source: &'a str,
    /// Current line number (used for error locations).
    line: u64,
    /// Current column number (used for error locations).
    col: u64,
    /// Byte offset of the next unconsumed character within `source`.
    /// Kept in sync with the stream, accounting for multi-byte UTF-8 chars.
    byte_pos: usize,
}
749753
750754impl State < ' _ > {
@@ -759,6 +763,8 @@ impl State<'_> {
759763 } else {
760764 self . col += 1 ;
761765 }
766+ // Update byte position (characters can be multi-byte in UTF-8)
767+ self . byte_pos += s. len_utf8 ( ) ;
762768 Some ( s)
763769 }
764770 }
@@ -769,6 +775,16 @@ impl State<'_> {
769775 self . peekable . peek ( )
770776 }
771777
778+ /// Return the character `n` positions ahead without advancing the stream.
779+ /// For example, `peek_nth(0)` returns the current character (same as peek),
780+ /// and `peek_nth(1)` returns the next character.
781+ pub fn peek_nth ( & self , n : usize ) -> Option < char > {
782+ if self . byte_pos >= self . source . len ( ) {
783+ return None ;
784+ }
785+ self . source [ self . byte_pos ..] . chars ( ) . nth ( n)
786+ }
787+
772788 pub fn location ( & self ) -> Location {
773789 Location {
774790 line : self . line ,
@@ -893,8 +909,10 @@ impl<'a> Tokenizer<'a> {
893909 ) -> Result < ( ) , TokenizerError > {
894910 let mut state = State {
895911 peekable : self . query . chars ( ) . peekable ( ) ,
912+ source : self . query ,
896913 line : 1 ,
897914 col : 1 ,
915+ byte_pos : 0 ,
898916 } ;
899917
900918 let mut location = state. location ( ) ;
@@ -908,22 +926,24 @@ impl<'a> Tokenizer<'a> {
908926 Ok ( ( ) )
909927 }
910928
911- // Tokenize the identifier or keywords in `ch`
929+ /// Tokenize an identifier or keyword after consuming the first character(s).
930+ /// `consumed_byte_len` is the total byte length of the character(s) already consumed.
912931 fn tokenize_identifier_or_keyword (
913932 & self ,
914- ch : impl IntoIterator < Item = char > ,
915- chars : & mut State ,
933+ consumed_byte_len : usize ,
934+ chars : & mut State < ' a > ,
916935 ) -> Result < Option < Token > , TokenizerError > {
917936 chars. next ( ) ; // consume the first char
918- let ch: String = ch. into_iter ( ) . collect ( ) ;
919- let word = self . tokenize_word ( ch, chars) ;
937+ let word = self . tokenize_word ( consumed_byte_len, chars) ;
920938
921939 // TODO: implement parsing of exponent here
922940 if word. chars ( ) . all ( |x| x. is_ascii_digit ( ) || x == '.' ) {
923941 let mut inner_state = State {
924942 peekable : word. chars ( ) . peekable ( ) ,
943+ source : & word,
925944 line : 0 ,
926945 col : 0 ,
946+ byte_pos : 0 ,
927947 } ;
928948 let mut s = peeking_take_while ( & mut inner_state, |ch| matches ! ( ch, '0' ..='9' | '.' ) ) ;
929949 let s2 = peeking_take_while ( chars, |ch| matches ! ( ch, '0' ..='9' | '.' ) ) ;
@@ -937,7 +957,7 @@ impl<'a> Tokenizer<'a> {
937957 /// Get the next token or return None
938958 fn next_token (
939959 & self ,
940- chars : & mut State ,
960+ chars : & mut State < ' a > ,
941961 prev_token : Option < & Token > ,
942962 ) -> Result < Option < Token > , TokenizerError > {
943963 match chars. peek ( ) {
@@ -988,7 +1008,7 @@ impl<'a> Tokenizer<'a> {
9881008 }
9891009 _ => {
9901010 // regular identifier starting with an "b" or "B"
991- let s = self . tokenize_word ( b, chars) ;
1011+ let s = self . tokenize_word ( b. len_utf8 ( ) , chars) ;
9921012 Ok ( Some ( Token :: make_word ( & s, None ) ) )
9931013 }
9941014 }
@@ -1015,7 +1035,7 @@ impl<'a> Tokenizer<'a> {
10151035 ) ,
10161036 _ => {
10171037 // regular identifier starting with an "r" or "R"
1018- let s = self . tokenize_word ( b, chars) ;
1038+ let s = self . tokenize_word ( b. len_utf8 ( ) , chars) ;
10191039 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10201040 }
10211041 }
@@ -1034,7 +1054,7 @@ impl<'a> Tokenizer<'a> {
10341054 }
10351055 _ => {
10361056 // regular identifier starting with an "N"
1037- let s = self . tokenize_word ( n, chars) ;
1057+ let s = self . tokenize_word ( n. len_utf8 ( ) , chars) ;
10381058 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10391059 }
10401060 }
@@ -1051,7 +1071,7 @@ impl<'a> Tokenizer<'a> {
10511071 }
10521072 _ => {
10531073 // regular identifier starting with an "E" or "e"
1054- let s = self . tokenize_word ( x, chars) ;
1074+ let s = self . tokenize_word ( x. len_utf8 ( ) , chars) ;
10551075 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10561076 }
10571077 }
@@ -1070,7 +1090,7 @@ impl<'a> Tokenizer<'a> {
10701090 }
10711091 }
10721092 // regular identifier starting with an "U" or "u"
1073- let s = self . tokenize_word ( x, chars) ;
1093+ let s = self . tokenize_word ( x. len_utf8 ( ) , chars) ;
10741094 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10751095 }
10761096 // The spec only allows an uppercase 'X' to introduce a hex
@@ -1085,7 +1105,7 @@ impl<'a> Tokenizer<'a> {
10851105 }
10861106 _ => {
10871107 // regular identifier starting with an "X"
1088- let s = self . tokenize_word ( x, chars) ;
1108+ let s = self . tokenize_word ( x. len_utf8 ( ) , chars) ;
10891109 Ok ( Some ( Token :: make_word ( & s, None ) ) )
10901110 }
10911111 }
@@ -1382,7 +1402,8 @@ impl<'a> Tokenizer<'a> {
13821402 match chars. peek ( ) {
13831403 Some ( s) if s. is_whitespace ( ) => Ok ( Some ( Token :: Mod ) ) ,
13841404 Some ( sch) if self . dialect . is_identifier_start ( '%' ) => {
1385- self . tokenize_identifier_or_keyword ( [ ch, * sch] , chars)
1405+ let consumed_byte_len = ch. len_utf8 ( ) + sch. len_utf8 ( ) ;
1406+ self . tokenize_identifier_or_keyword ( consumed_byte_len, chars)
13861407 }
13871408 _ => self . start_binop ( chars, "%" , Token :: Mod ) ,
13881409 }
@@ -1610,7 +1631,8 @@ impl<'a> Tokenizer<'a> {
16101631 self . consume_for_binop ( chars, "##" , Token :: DoubleSharp )
16111632 }
16121633 Some ( sch) if self . dialect . is_identifier_start ( '#' ) => {
1613- self . tokenize_identifier_or_keyword ( [ ch, * sch] , chars)
1634+ let consumed_byte_len = ch. len_utf8 ( ) + sch. len_utf8 ( ) ;
1635+ self . tokenize_identifier_or_keyword ( consumed_byte_len, chars)
16141636 }
16151637 _ => self . start_binop ( chars, "#" , Token :: Sharp ) ,
16161638 }
@@ -1635,7 +1657,9 @@ impl<'a> Tokenizer<'a> {
16351657 match chars. peek ( ) {
16361658 Some ( ' ' ) => Ok ( Some ( Token :: AtAt ) ) ,
16371659 Some ( tch) if self . dialect . is_identifier_start ( '@' ) => {
1638- self . tokenize_identifier_or_keyword ( [ ch, '@' , * tch] , chars)
1660+ let consumed_byte_len =
1661+ ch. len_utf8 ( ) + '@' . len_utf8 ( ) + tch. len_utf8 ( ) ;
1662+ self . tokenize_identifier_or_keyword ( consumed_byte_len, chars)
16391663 }
16401664 _ => Ok ( Some ( Token :: AtAt ) ) ,
16411665 }
@@ -1654,7 +1678,8 @@ impl<'a> Tokenizer<'a> {
16541678 Some ( '\"' ) => Ok ( Some ( Token :: AtSign ) ) ,
16551679 Some ( '`' ) => Ok ( Some ( Token :: AtSign ) ) ,
16561680 Some ( sch) if self . dialect . is_identifier_start ( '@' ) => {
1657- self . tokenize_identifier_or_keyword ( [ ch, * sch] , chars)
1681+ let consumed_byte_len = ch. len_utf8 ( ) + sch. len_utf8 ( ) ;
1682+ self . tokenize_identifier_or_keyword ( consumed_byte_len, chars)
16581683 }
16591684 _ => Ok ( Some ( Token :: AtSign ) ) ,
16601685 }
@@ -1695,7 +1720,8 @@ impl<'a> Tokenizer<'a> {
16951720
16961721 // identifier or keyword
16971722 ch if self . dialect . is_identifier_start ( ch) => {
1698- self . tokenize_identifier_or_keyword ( [ ch] , chars)
1723+ let consumed_byte_len = ch. len_utf8 ( ) ;
1724+ self . tokenize_identifier_or_keyword ( consumed_byte_len, chars)
16991725 }
17001726 '$' => Ok ( Some ( self . tokenize_dollar_preceded_value ( chars) ?) ) ,
17011727
@@ -1876,13 +1902,36 @@ impl<'a> Tokenizer<'a> {
18761902 comment
18771903 }
18781904
1879- /// Tokenize an identifier or keyword, after the first char is already consumed.
1880- fn tokenize_word ( & self , first_chars : impl Into < String > , chars : & mut State ) -> String {
1881- let mut s = first_chars. into ( ) ;
1882- s. push_str ( & peeking_take_while ( chars, |ch| {
1883- self . dialect . is_identifier_part ( ch)
1884- } ) ) ;
1885- s
1905+ /// Tokenize an identifier or keyword, after the first char(s) have already been consumed.
1906+ /// `consumed_byte_len` is the byte length of the consumed character(s).
1907+ fn tokenize_word ( & self , consumed_byte_len : usize , chars : & mut State < ' a > ) -> String {
1908+ // Overflow check: ensure we can safely subtract
1909+ if consumed_byte_len > chars. byte_pos {
1910+ return String :: new ( ) ;
1911+ }
1912+
1913+ // Calculate where the first character started
1914+ let first_char_byte_pos = chars. byte_pos - consumed_byte_len;
1915+
1916+ // Use the zero-copy version and convert to String
1917+ self . tokenize_word_borrowed ( first_char_byte_pos, chars)
1918+ . to_string ( )
1919+ }
1920+
1921+ /// Tokenize an identifier or keyword, returning a borrowed slice when possible.
1922+ /// The first character position must be provided (before it was consumed).
1923+ /// Returns a slice with the same lifetime as the State's source.
1924+ fn tokenize_word_borrowed ( & self , first_char_byte_pos : usize , chars : & mut State < ' a > ) -> & ' a str {
1925+ // Consume the rest of the word
1926+ peeking_take_while_ref ( chars, |ch| self . dialect . is_identifier_part ( ch) ) ;
1927+
1928+ // Boundary check: ensure first_char_byte_pos is valid
1929+ if first_char_byte_pos > chars. byte_pos || first_char_byte_pos > chars. source . len ( ) {
1930+ return "" ;
1931+ }
1932+
1933+ // Return a slice from the first char to the current position
1934+ & chars. source [ first_char_byte_pos..chars. byte_pos ]
18861935 }
18871936
18881937 /// Read a quoted identifier
@@ -2176,35 +2225,72 @@ impl<'a> Tokenizer<'a> {
21762225/// Read from `chars` until `predicate` returns `false` or EOF is hit.
21772226/// Return the characters read as String, and keep the first non-matching
21782227/// char available as `chars.next()`.
2179- fn peeking_take_while ( chars : & mut State , mut predicate : impl FnMut ( char ) -> bool ) -> String {
2180- let mut s = String :: new ( ) ;
2228+ fn peeking_take_while ( chars : & mut State , predicate : impl FnMut ( char ) -> bool ) -> String {
2229+ peeking_take_while_ref ( chars, predicate) . to_string ( )
2230+ }
2231+
2232+ /// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
2233+ /// Returns a borrowed slice of the source string containing the matched characters.
2234+ /// This is the zero-copy version of `peeking_take_while`.
2235+ fn peeking_take_while_ref < ' a > (
2236+ chars : & mut State < ' a > ,
2237+ mut predicate : impl FnMut ( char ) -> bool ,
2238+ ) -> & ' a str {
2239+ // Record the starting byte position
2240+ let start_pos = chars. byte_pos ;
2241+
2242+ // Consume characters while predicate is true
21812243 while let Some ( & ch) = chars. peek ( ) {
21822244 if predicate ( ch) {
2183- chars. next ( ) ; // consume
2184- s. push ( ch) ;
2245+ chars. next ( ) ; // consume (this updates byte_pos)
21852246 } else {
21862247 break ;
21872248 }
21882249 }
2189- s
2250+
2251+ // Get the ending byte position
2252+ let end_pos = chars. byte_pos ;
2253+
2254+ // Return the slice from the original source
2255+ & chars. source [ start_pos..end_pos]
21902256}
21912257
2192- /// Same as peeking_take_while, but also passes the next character to the predicate.
2193- fn peeking_next_take_while (
2194- chars : & mut State ,
2258+ /// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
2259+ /// This version also passes the next character to the predicate for lookahead, taking
2260+ /// both the current char and optional next char. Returns a borrowed slice of the source
2261+ /// string containing the matched characters.
2262+ ///
2263+ /// This is a zero-copy version of `peeking_next_take_while`.
2264+ fn peeking_take_while_next_ref < ' a > (
2265+ chars : & mut State < ' a > ,
21952266 mut predicate : impl FnMut ( char , Option < char > ) -> bool ,
2196- ) -> String {
2197- let mut s = String :: new ( ) ;
2267+ ) -> & ' a str {
2268+ // Record the starting byte position
2269+ let start_pos = chars. byte_pos ;
2270+
2271+ // Consume characters while predicate is true
21982272 while let Some ( & ch) = chars. peek ( ) {
2199- let next_char = chars. peekable . clone ( ) . nth ( 1 ) ;
2273+ let next_char = chars. peek_nth ( 1 ) ;
22002274 if predicate ( ch, next_char) {
2201- chars. next ( ) ; // consume
2202- s. push ( ch) ;
2275+ chars. next ( ) ; // consume (this updates byte_pos)
22032276 } else {
22042277 break ;
22052278 }
22062279 }
2207- s
2280+
2281+ // Get the ending byte position
2282+ let end_pos = chars. byte_pos ;
2283+
2284+ // Return the slice from the original source
2285+ & chars. source [ start_pos..end_pos]
2286+ }
2287+
2288+ /// Same as peeking_take_while, but also passes the next character to the predicate.
2289+ fn peeking_next_take_while (
2290+ chars : & mut State ,
2291+ predicate : impl FnMut ( char , Option < char > ) -> bool ,
2292+ ) -> String {
2293+ peeking_take_while_next_ref ( chars, predicate) . to_string ( )
22082294}
22092295
22102296fn unescape_single_quoted_string ( chars : & mut State < ' _ > ) -> Option < String > {
@@ -3496,8 +3582,10 @@ mod tests {
34963582 let s = format ! ( "'{s}'" ) ;
34973583 let mut state = State {
34983584 peekable : s. chars ( ) . peekable ( ) ,
3585+ source : & s,
34993586 line : 0 ,
35003587 col : 0 ,
3588+ byte_pos : 0 ,
35013589 } ;
35023590
35033591 assert_eq ! (
0 commit comments