From 59583ac554229ec3eea64f3a1a9e1a4c5d4badba Mon Sep 17 00:00:00 2001 From: Julian Wollersberger Date: Sat, 13 Feb 2021 19:33:26 +0100 Subject: [PATCH 1/6] Move lexing of number and string literals into a separate file. Also make them freestanding functions instead of methods. --- compiler/rustc_lexer/src/lib.rs | 336 ++------------------------- compiler/rustc_lexer/src/literals.rs | 309 ++++++++++++++++++++++++ 2 files changed, 327 insertions(+), 318 deletions(-) create mode 100644 compiler/rustc_lexer/src/literals.rs diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 44fc4db7dc199..366e0771320a8 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -22,15 +22,18 @@ // `#![feature]` attributes should be added. mod cursor; +mod literals; pub mod unescape; #[cfg(test)] mod tests; -use self::LiteralKind::*; use self::TokenKind::*; -use crate::cursor::{Cursor, EOF_CHAR}; -use std::convert::TryFrom; +use crate::cursor::Cursor; +use crate::literals::{ + double_quoted_string, lifetime_or_char, number, raw_double_quoted_string, single_quoted_string, + LiteralKind, +}; /// Parsed token. 
/// It doesn't contain information about data that has been parsed, @@ -137,55 +140,6 @@ pub enum DocStyle { Inner, } -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] -pub enum LiteralKind { - /// "12_u8", "0o100", "0b120i99" - Int { base: Base, empty_int: bool }, - /// "12.34f32", "0b100.100" - Float { base: Base, empty_exponent: bool }, - /// "'a'", "'\\'", "'''", "';" - Char { terminated: bool }, - /// "b'a'", "b'\\'", "b'''", "b';" - Byte { terminated: bool }, - /// ""abc"", ""abc" - Str { terminated: bool }, - /// "b"abc"", "b"abc" - ByteStr { terminated: bool }, - /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a" - RawStr { n_hashes: u16, err: Option<RawStrError> }, - /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a" - RawByteStr { n_hashes: u16, err: Option<RawStrError> }, -} - -/// Error produced validating a raw string. Represents cases like: -/// - `r##~"abcde"##`: `InvalidStarter` -/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)` -/// - Too many `#`s (>65535): `TooManyDelimiters` -// perf note: It doesn't matter that this makes `Token` 36 bytes bigger. See #77629 -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] -pub enum RawStrError { - /// Non `#` characters exist between `r` and `"` eg. `r#~"..` - InvalidStarter { bad_char: char }, - /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they - /// may have intended to terminate it. - NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> }, - /// More than 65535 `#`s exist. - TooManyDelimiters { found: usize }, -} - -/// Base of numeric literal encoding according to its prefix. -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] -pub enum Base { - /// Literal starts with "0b". - Binary, - /// Literal starts with "0o". - Octal, - /// Literal starts with "0x". - Hexadecimal, - /// Literal doesn't contain a prefix. 
- Decimal, -} - /// `rustc` allows files to have a shebang, e.g. "#!/usr/bin/rustrun", /// but shebang isn't a part of rust syntax. pub fn strip_shebang(input: &str) -> Option<usize> { @@ -315,12 +269,12 @@ impl Cursor<'_> { 'r' => match (self.first(), self.second()) { ('#', c1) if is_id_start(c1) => self.raw_ident(), ('#', _) | ('"', _) => { - let (n_hashes, err) = self.raw_double_quoted_string(1); + let (n_hashes, err) = raw_double_quoted_string(self, 1); let suffix_start = self.len_consumed(); if err.is_none() { self.eat_literal_suffix(); } - let kind = RawStr { n_hashes, err }; + let kind = LiteralKind::RawStr { n_hashes, err }; Literal { kind, suffix_start } } _ => self.ident(), @@ -330,32 +284,32 @@ impl Cursor<'_> { 'b' => match (self.first(), self.second()) { ('\'', _) => { self.bump(); - let terminated = self.single_quoted_string(); + let terminated = single_quoted_string(self); let suffix_start = self.len_consumed(); if terminated { self.eat_literal_suffix(); } - let kind = Byte { terminated }; + let kind = LiteralKind::Byte { terminated }; Literal { kind, suffix_start } } ('"', _) => { self.bump(); - let terminated = self.double_quoted_string(); + let terminated = double_quoted_string(self); let suffix_start = self.len_consumed(); if terminated { self.eat_literal_suffix(); } - let kind = ByteStr { terminated }; + let kind = LiteralKind::ByteStr { terminated }; Literal { kind, suffix_start } } ('r', '"') | ('r', '#') => { self.bump(); - let (n_hashes, err) = self.raw_double_quoted_string(2); + let (n_hashes, err) = raw_double_quoted_string(self, 2); let suffix_start = self.len_consumed(); if err.is_none() { self.eat_literal_suffix(); } - let kind = RawByteStr { n_hashes, err }; + let kind = LiteralKind::RawByteStr { n_hashes, err }; Literal { kind, suffix_start } } _ => self.ident(), @@ -367,7 +321,7 @@ impl Cursor<'_> { // Numeric literal. 
c @ '0'..='9' => { - let literal_kind = self.number(c); + let literal_kind = number(self, c); let suffix_start = self.len_consumed(); self.eat_literal_suffix(); TokenKind::Literal { kind: literal_kind, suffix_start } @@ -402,16 +356,16 @@ impl Cursor<'_> { '%' => Percent, // Lifetime or character literal. - '\'' => self.lifetime_or_char(), + '\'' => lifetime_or_char(self), // String literal. '"' => { - let terminated = self.double_quoted_string(); + let terminated = double_quoted_string(self); let suffix_start = self.len_consumed(); if terminated { self.eat_literal_suffix(); } - let kind = Str { terminated }; + let kind = LiteralKind::Str { terminated }; Literal { kind, suffix_start } } _ => Unknown, @@ -494,260 +448,6 @@ impl Cursor<'_> { Ident } - fn number(&mut self, first_digit: char) -> LiteralKind { - debug_assert!('0' <= self.prev() && self.prev() <= '9'); - let mut base = Base::Decimal; - if first_digit == '0' { - // Attempt to parse encoding base. - let has_digits = match self.first() { - 'b' => { - base = Base::Binary; - self.bump(); - self.eat_decimal_digits() - } - 'o' => { - base = Base::Octal; - self.bump(); - self.eat_decimal_digits() - } - 'x' => { - base = Base::Hexadecimal; - self.bump(); - self.eat_hexadecimal_digits() - } - // Not a base prefix. - '0'..='9' | '_' | '.' | 'e' | 'E' => { - self.eat_decimal_digits(); - true - } - // Just a 0. - _ => return Int { base, empty_int: false }, - }; - // Base prefix was provided, but there were no digits - // after it, e.g. "0x". - if !has_digits { - return Int { base, empty_int: true }; - } - } else { - // No base prefix, parse number in the usual way. - self.eat_decimal_digits(); - }; - - match self.first() { - // Don't be greedy if this is actually an - // integer literal followed by field/method access or a range pattern - // (`0..2` and `12.foo()`) - '.' if self.second() != '.' 
&& !is_id_start(self.second()) => { - // might have stuff after the ., and if it does, it needs to start - // with a number - self.bump(); - let mut empty_exponent = false; - if self.first().is_digit(10) { - self.eat_decimal_digits(); - match self.first() { - 'e' | 'E' => { - self.bump(); - empty_exponent = !self.eat_float_exponent(); - } - _ => (), - } - } - Float { base, empty_exponent } - } - 'e' | 'E' => { - self.bump(); - let empty_exponent = !self.eat_float_exponent(); - Float { base, empty_exponent } - } - _ => Int { base, empty_int: false }, - } - } - - fn lifetime_or_char(&mut self) -> TokenKind { - debug_assert!(self.prev() == '\''); - - let can_be_a_lifetime = if self.second() == '\'' { - // It's surely not a lifetime. - false - } else { - // If the first symbol is valid for identifier, it can be a lifetime. - // Also check if it's a number for a better error reporting (so '0 will - // be reported as invalid lifetime and not as unterminated char literal). - is_id_start(self.first()) || self.first().is_digit(10) - }; - - if !can_be_a_lifetime { - let terminated = self.single_quoted_string(); - let suffix_start = self.len_consumed(); - if terminated { - self.eat_literal_suffix(); - } - let kind = Char { terminated }; - return Literal { kind, suffix_start }; - } - - // Either a lifetime or a character literal with - // length greater than 1. - - let starts_with_number = self.first().is_digit(10); - - // Skip the literal contents. - // First symbol can be a number (which isn't a valid identifier start), - // so skip it without any checks. - self.bump(); - self.eat_while(is_id_continue); - - // Check if after skipping literal contents we've met a closing - // single quote (which means that user attempted to create a - // string with single quotes). 
- if self.first() == '\'' { - self.bump(); - let kind = Char { terminated: true }; - Literal { kind, suffix_start: self.len_consumed() } - } else { - Lifetime { starts_with_number } - } - } - - fn single_quoted_string(&mut self) -> bool { - debug_assert!(self.prev() == '\''); - // Check if it's a one-symbol literal. - if self.second() == '\'' && self.first() != '\\' { - self.bump(); - self.bump(); - return true; - } - - // Literal has more than one symbol. - - // Parse until either quotes are terminated or error is detected. - loop { - match self.first() { - // Quotes are terminated, finish parsing. - '\'' => { - self.bump(); - return true; - } - // Probably beginning of the comment, which we don't want to include - // to the error report. - '/' => break, - // Newline without following '\'' means unclosed quote, stop parsing. - '\n' if self.second() != '\'' => break, - // End of file, stop parsing. - EOF_CHAR if self.is_eof() => break, - // Escaped slash is considered one character, so bump twice. - '\\' => { - self.bump(); - self.bump(); - } - // Skip the character. - _ => { - self.bump(); - } - } - } - // String was not terminated. - false - } - - /// Eats double-quoted string and returns true - /// if string is terminated. - fn double_quoted_string(&mut self) -> bool { - debug_assert!(self.prev() == '"'); - while let Some(c) = self.bump() { - match c { - '"' => { - return true; - } - '\\' if self.first() == '\\' || self.first() == '"' => { - // Bump again to skip escaped character. - self.bump(); - } - _ => (), - } - } - // End of file reached. - false - } - - /// Eats the double-quoted string and returns `n_hashes` and an error if encountered. - fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u16, Option<RawStrError>) { - // Wrap the actual function to handle the error with too many hashes. - // This way, it eats the whole raw string. 
- let (n_hashes, err) = self.raw_string_unvalidated(prefix_len); - // Only up to 65535 `#`s are allowed in raw strings - match u16::try_from(n_hashes) { - Ok(num) => (num, err), - // We lie about the number of hashes here :P - Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })), - } - } - - fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) { - debug_assert!(self.prev() == 'r'); - let start_pos = self.len_consumed(); - let mut possible_terminator_offset = None; - let mut max_hashes = 0; - - // Count opening '#' symbols. - let mut eaten = 0; - while self.first() == '#' { - eaten += 1; - self.bump(); - } - let n_start_hashes = eaten; - - // Check that string is started. - match self.bump() { - Some('"') => (), - c => { - let c = c.unwrap_or(EOF_CHAR); - return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c })); - } - } - - // Skip the string contents and on each '#' character met, check if this is - // a raw string termination. - loop { - self.eat_while(|c| c != '"'); - - if self.is_eof() { - return ( - n_start_hashes, - Some(RawStrError::NoTerminator { - expected: n_start_hashes, - found: max_hashes, - possible_terminator_offset, - }), - ); - } - - // Eat closing double quote. - self.bump(); - - // Check that amount of closing '#' symbols - // is equal to the amount of opening ones. - // Note that this will not consume extra trailing `#` characters: - // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }` - // followed by a `#` token. 
- let mut n_end_hashes = 0; - while self.first() == '#' && n_end_hashes < n_start_hashes { - n_end_hashes += 1; - self.bump(); - } - - if n_end_hashes == n_start_hashes { - return (n_start_hashes, None); - } else if n_end_hashes > max_hashes { - // Keep track of possible terminators to give a hint about - // where there might be a missing terminator - possible_terminator_offset = - Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len); - max_hashes = n_end_hashes; - } - } - } - fn eat_decimal_digits(&mut self) -> bool { let mut has_digits = false; loop { diff --git a/compiler/rustc_lexer/src/literals.rs b/compiler/rustc_lexer/src/literals.rs new file mode 100644 index 0000000000000..2bfa5f9228b84 --- /dev/null +++ b/compiler/rustc_lexer/src/literals.rs @@ -0,0 +1,309 @@ +use crate::cursor::{Cursor, EOF_CHAR}; +use crate::{is_id_continue, is_id_start, TokenKind}; +use std::convert::TryFrom; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum LiteralKind { + /// "12_u8", "0o100", "0b120i99" + Int { base: Base, empty_int: bool }, + /// "12.34f32", "0b100.100" + Float { base: Base, empty_exponent: bool }, + /// "'a'", "'\\'", "'''", "';" + Char { terminated: bool }, + /// "b'a'", "b'\\'", "b'''", "b';" + Byte { terminated: bool }, + /// ""abc"", ""abc" + Str { terminated: bool }, + /// "b"abc"", "b"abc" + ByteStr { terminated: bool }, + /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a" + RawStr { n_hashes: u16, err: Option<RawStrError> }, + /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a" + RawByteStr { n_hashes: u16, err: Option<RawStrError> }, +} + +/// Base of numeric literal encoding according to its prefix. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum Base { + /// Literal starts with "0b". + Binary, + /// Literal starts with "0o". + Octal, + /// Literal starts with "0x". + Hexadecimal, + /// Literal doesn't contain a prefix. + Decimal, +} + +/// Error produced validating a raw string. 
Represents cases like: +/// - `r##~"abcde"##`: `InvalidStarter` +/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)` +/// - Too many `#`s (>65535): `TooManyDelimiters` +// perf note: It doesn't matter that this makes `Token` 36 bytes bigger. See #77629 +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum RawStrError { + /// Non `#` characters exist between `r` and `"` eg. `r#~"..` + InvalidStarter { bad_char: char }, + /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they + /// may have intended to terminate it. + NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> }, + /// More than 65535 `#`s exist. + TooManyDelimiters { found: usize }, +} + +pub(crate) fn number(cursor: &mut Cursor, first_digit: char) -> LiteralKind { + debug_assert!('0' <= cursor.prev() && cursor.prev() <= '9'); + let mut base = Base::Decimal; + if first_digit == '0' { + // Attempt to parse encoding base. + let has_digits = match cursor.first() { + 'b' => { + base = Base::Binary; + cursor.bump(); + cursor.eat_decimal_digits() + } + 'o' => { + base = Base::Octal; + cursor.bump(); + cursor.eat_decimal_digits() + } + 'x' => { + base = Base::Hexadecimal; + cursor.bump(); + cursor.eat_hexadecimal_digits() + } + // Not a base prefix. + '0'..='9' | '_' | '.' | 'e' | 'E' => { + cursor.eat_decimal_digits(); + true + } + // Just a 0. + _ => return LiteralKind::Int { base, empty_int: false }, + }; + // Base prefix was provided, but there were no digits + // after it, e.g. "0x". + if !has_digits { + return LiteralKind::Int { base, empty_int: true }; + } + } else { + // No base prefix, parse number in the usual way. + cursor.eat_decimal_digits(); + }; + + match cursor.first() { + // Don't be greedy if this is actually an + // integer literal followed by field/method access or a range pattern + // (`0..2` and `12.foo()`) + '.' 
if cursor.second() != '.' && !is_id_start(cursor.second()) => { + // might have stuff after the ., and if it does, it needs to start + // with a number + cursor.bump(); + let mut empty_exponent = false; + if cursor.first().is_digit(10) { + cursor.eat_decimal_digits(); + match cursor.first() { + 'e' | 'E' => { + cursor.bump(); + empty_exponent = !cursor.eat_float_exponent(); + } + _ => (), + } + } + LiteralKind::Float { base, empty_exponent } + } + 'e' | 'E' => { + cursor.bump(); + let empty_exponent = !cursor.eat_float_exponent(); + LiteralKind::Float { base, empty_exponent } + } + _ => LiteralKind::Int { base, empty_int: false }, + } +} + +pub(crate) fn lifetime_or_char(cursor: &mut Cursor) -> TokenKind { + debug_assert!(cursor.prev() == '\''); + + let can_be_a_lifetime = if cursor.second() == '\'' { + // It's surely not a lifetime. + false + } else { + // If the first symbol is valid for identifier, it can be a lifetime. + // Also check if it's a number for a better error reporting (so '0 will + // be reported as invalid lifetime and not as unterminated char literal). + is_id_start(cursor.first()) || cursor.first().is_digit(10) + }; + + if !can_be_a_lifetime { + let terminated = single_quoted_string(cursor); + let suffix_start = cursor.len_consumed(); + if terminated { + cursor.eat_literal_suffix(); + } + let kind = LiteralKind::Char { terminated }; + return TokenKind::Literal { kind, suffix_start }; + } + + // Either a lifetime or a character literal with + // length greater than 1. + + let starts_with_number = cursor.first().is_digit(10); + + // Skip the literal contents. + // First symbol can be a number (which isn't a valid identifier start), + // so skip it without any checks. + cursor.bump(); + cursor.eat_while(is_id_continue); + + // Check if after skipping literal contents we've met a closing + // single quote (which means that user attempted to create a + // string with single quotes). 
+ if cursor.first() == '\'' { + cursor.bump(); + let kind = LiteralKind::Char { terminated: true }; + TokenKind::Literal { kind, suffix_start: cursor.len_consumed() } + } else { + TokenKind::Lifetime { starts_with_number } + } +} + +pub(crate) fn single_quoted_string(cursor: &mut Cursor) -> bool { + debug_assert!(cursor.prev() == '\''); + // Check if it's a one-symbol literal. + if cursor.second() == '\'' && cursor.first() != '\\' { + cursor.bump(); + cursor.bump(); + return true; + } + + // Literal has more than one symbol. + + // Parse until either quotes are terminated or error is detected. + loop { + match cursor.first() { + // Quotes are terminated, finish parsing. + '\'' => { + cursor.bump(); + return true; + } + // Probably beginning of the comment, which we don't want to include + // to the error report. + '/' => break, + // Newline without following '\'' means unclosed quote, stop parsing. + '\n' if cursor.second() != '\'' => break, + // End of file, stop parsing. + EOF_CHAR if cursor.is_eof() => break, + // Escaped slash is considered one character, so bump twice. + '\\' => { + cursor.bump(); + cursor.bump(); + } + // Skip the character. + _ => { + cursor.bump(); + } + } + } + // String was not terminated. + false +} + +/// Eats double-quoted string and returns true +/// if string is terminated. +pub(crate) fn double_quoted_string(cursor: &mut Cursor) -> bool { + debug_assert!(cursor.prev() == '"'); + while let Some(c) = cursor.bump() { + match c { + '"' => { + return true; + } + '\\' if cursor.first() == '\\' || cursor.first() == '"' => { + // Bump again to skip escaped character. + cursor.bump(); + } + _ => (), + } + } + // End of file reached. + false +} + +/// Eats the double-quoted string and returns `n_hashes` and an error if encountered. +pub(crate) fn raw_double_quoted_string( + cursor: &mut Cursor, + prefix_len: usize, +) -> (u16, Option<RawStrError>) { + // Wrap the actual function to handle the error with too many hashes. 
+ // This way, it eats the whole raw string. + let (n_hashes, err) = raw_string_unvalidated(cursor, prefix_len); + // Only up to 65535 `#`s are allowed in raw strings + match u16::try_from(n_hashes) { + Ok(num) => (num, err), + // We lie about the number of hashes here :P + Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })), + } +} + +fn raw_string_unvalidated(cursor: &mut Cursor, prefix_len: usize) -> (usize, Option<RawStrError>) { + debug_assert!(cursor.prev() == 'r'); + let start_pos = cursor.len_consumed(); + let mut possible_terminator_offset = None; + let mut max_hashes = 0; + + // Count opening '#' symbols. + let mut eaten = 0; + while cursor.first() == '#' { + eaten += 1; + cursor.bump(); + } + let n_start_hashes = eaten; + + // Check that string is started. + match cursor.bump() { + Some('"') => (), + c => { + let c = c.unwrap_or(EOF_CHAR); + return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c })); + } + } + + // Skip the string contents and on each '#' character met, check if this is + // a raw string termination. + loop { + cursor.eat_while(|c| c != '"'); + + if cursor.is_eof() { + return ( + n_start_hashes, + Some(RawStrError::NoTerminator { + expected: n_start_hashes, + found: max_hashes, + possible_terminator_offset, + }), + ); + } + + // Eat closing double quote. + cursor.bump(); + + // Check that amount of closing '#' symbols + // is equal to the amount of opening ones. + // Note that this will not consume extra trailing `#` characters: + // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }` + // followed by a `#` token. 
+ let mut n_end_hashes = 0; + while cursor.first() == '#' && n_end_hashes < n_start_hashes { + n_end_hashes += 1; + cursor.bump(); + } + + if n_end_hashes == n_start_hashes { + return (n_start_hashes, None); + } else if n_end_hashes > max_hashes { + // Keep track of possible terminators to give a hint about + // where there might be a missing terminator + possible_terminator_offset = + Some(cursor.len_consumed() - start_pos - n_end_hashes + prefix_len); + max_hashes = n_end_hashes; + } + } +} From a534bd71a32abe837f68496d051ada75022a291b Mon Sep 17 00:00:00 2001 From: Julian Wollersberger Date: Wed, 3 Mar 2021 22:34:57 +0100 Subject: [PATCH 2/6] Move the `eat_*_digits()` methods to `literals.rs`. --- compiler/rustc_lexer/src/cursor.rs | 7 +++ compiler/rustc_lexer/src/lib.rs | 74 ++++------------------------ compiler/rustc_lexer/src/literals.rs | 67 +++++++++++++++++++++---- 3 files changed, 74 insertions(+), 74 deletions(-) diff --git a/compiler/rustc_lexer/src/cursor.rs b/compiler/rustc_lexer/src/cursor.rs index 297f3d19ca178..e188102c8e580 100644 --- a/compiler/rustc_lexer/src/cursor.rs +++ b/compiler/rustc_lexer/src/cursor.rs @@ -81,4 +81,11 @@ impl<'a> Cursor<'a> { Some(c) } + + /// Eats symbols while predicate returns true or until the end of file is reached. 
+ pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { + while predicate(self.first()) && !self.is_eof() { + self.bump(); + } + } } diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 366e0771320a8..9855cf13092fb 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -31,8 +31,8 @@ mod tests; use self::TokenKind::*; use crate::cursor::Cursor; use crate::literals::{ - double_quoted_string, lifetime_or_char, number, raw_double_quoted_string, single_quoted_string, - LiteralKind, + double_quoted_string, eat_literal_suffix, lifetime_or_char, number, raw_double_quoted_string, + single_quoted_string, LiteralKind, }; /// Parsed token. @@ -272,7 +272,7 @@ impl Cursor<'_> { let (n_hashes, err) = raw_double_quoted_string(self, 1); let suffix_start = self.len_consumed(); if err.is_none() { - self.eat_literal_suffix(); + eat_literal_suffix(self); } let kind = LiteralKind::RawStr { n_hashes, err }; Literal { kind, suffix_start } @@ -287,7 +287,7 @@ impl Cursor<'_> { let terminated = single_quoted_string(self); let suffix_start = self.len_consumed(); if terminated { - self.eat_literal_suffix(); + eat_literal_suffix(self); } let kind = LiteralKind::Byte { terminated }; Literal { kind, suffix_start } @@ -297,7 +297,7 @@ impl Cursor<'_> { let terminated = double_quoted_string(self); let suffix_start = self.len_consumed(); if terminated { - self.eat_literal_suffix(); + eat_literal_suffix(self); } let kind = LiteralKind::ByteStr { terminated }; Literal { kind, suffix_start } @@ -307,7 +307,7 @@ impl Cursor<'_> { let (n_hashes, err) = raw_double_quoted_string(self, 2); let suffix_start = self.len_consumed(); if err.is_none() { - self.eat_literal_suffix(); + eat_literal_suffix(self); } let kind = LiteralKind::RawByteStr { n_hashes, err }; Literal { kind, suffix_start } @@ -323,7 +323,7 @@ impl Cursor<'_> { c @ '0'..='9' => { let literal_kind = number(self, c); let suffix_start = self.len_consumed(); 
- self.eat_literal_suffix(); + eat_literal_suffix(self); TokenKind::Literal { kind: literal_kind, suffix_start } } @@ -363,7 +363,7 @@ impl Cursor<'_> { let terminated = double_quoted_string(self); let suffix_start = self.len_consumed(); if terminated { - self.eat_literal_suffix(); + eat_literal_suffix(self); } let kind = LiteralKind::Str { terminated }; Literal { kind, suffix_start } @@ -448,56 +448,7 @@ impl Cursor<'_> { Ident } - fn eat_decimal_digits(&mut self) -> bool { - let mut has_digits = false; - loop { - match self.first() { - '_' => { - self.bump(); - } - '0'..='9' => { - has_digits = true; - self.bump(); - } - _ => break, - } - } - has_digits - } - - fn eat_hexadecimal_digits(&mut self) -> bool { - let mut has_digits = false; - loop { - match self.first() { - '_' => { - self.bump(); - } - '0'..='9' | 'a'..='f' | 'A'..='F' => { - has_digits = true; - self.bump(); - } - _ => break, - } - } - has_digits - } - - /// Eats the float exponent. Returns true if at least one digit was met, - /// and returns false otherwise. - fn eat_float_exponent(&mut self) -> bool { - debug_assert!(self.prev() == 'e' || self.prev() == 'E'); - if self.first() == '-' || self.first() == '+' { - self.bump(); - } - self.eat_decimal_digits() - } - - // Eats the suffix of the literal, e.g. "_u8". - fn eat_literal_suffix(&mut self) { - self.eat_identifier(); - } - - // Eats the identifier. + /// Eats one identifier. fn eat_identifier(&mut self) { if !is_id_start(self.first()) { return; @@ -506,11 +457,4 @@ impl Cursor<'_> { self.eat_while(is_id_continue); } - - /// Eats symbols while predicate returns true or until the end of file is reached. 
- fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { - while predicate(self.first()) && !self.is_eof() { - self.bump(); - } - } } diff --git a/compiler/rustc_lexer/src/literals.rs b/compiler/rustc_lexer/src/literals.rs index 2bfa5f9228b84..401b4a5df653d 100644 --- a/compiler/rustc_lexer/src/literals.rs +++ b/compiler/rustc_lexer/src/literals.rs @@ -60,21 +60,21 @@ pub(crate) fn number(cursor: &mut Cursor, first_digit: char) -> LiteralKind { 'b' => { base = Base::Binary; cursor.bump(); - cursor.eat_decimal_digits() + eat_decimal_digits(cursor) } 'o' => { base = Base::Octal; cursor.bump(); - cursor.eat_decimal_digits() + eat_decimal_digits(cursor) } 'x' => { base = Base::Hexadecimal; cursor.bump(); - cursor.eat_hexadecimal_digits() + eat_hexadecimal_digits(cursor) } // Not a base prefix. '0'..='9' | '_' | '.' | 'e' | 'E' => { - cursor.eat_decimal_digits(); + eat_decimal_digits(cursor); true } // Just a 0. @@ -87,7 +87,7 @@ pub(crate) fn number(cursor: &mut Cursor, first_digit: char) -> LiteralKind { } } else { // No base prefix, parse number in the usual way. 
- cursor.eat_decimal_digits(); + eat_decimal_digits(cursor); }; match cursor.first() { @@ -100,11 +100,11 @@ pub(crate) fn number(cursor: &mut Cursor, first_digit: char) -> LiteralKind { cursor.bump(); let mut empty_exponent = false; if cursor.first().is_digit(10) { - cursor.eat_decimal_digits(); + eat_decimal_digits(cursor); match cursor.first() { 'e' | 'E' => { cursor.bump(); - empty_exponent = !cursor.eat_float_exponent(); + empty_exponent = !eat_float_exponent(cursor); } _ => (), } @@ -113,13 +113,57 @@ pub(crate) fn number(cursor: &mut Cursor, first_digit: char) -> LiteralKind { } 'e' | 'E' => { cursor.bump(); - let empty_exponent = !cursor.eat_float_exponent(); + let empty_exponent = !eat_float_exponent(cursor); LiteralKind::Float { base, empty_exponent } } _ => LiteralKind::Int { base, empty_int: false }, } } +pub(crate) fn eat_decimal_digits(cursor: &mut Cursor) -> bool { + let mut has_digits = false; + loop { + match cursor.first() { + '_' => { + cursor.bump(); + } + '0'..='9' => { + has_digits = true; + cursor.bump(); + } + _ => break, + } + } + has_digits +} + +pub(crate) fn eat_hexadecimal_digits(cursor: &mut Cursor) -> bool { + let mut has_digits = false; + loop { + match cursor.first() { + '_' => { + cursor.bump(); + } + '0'..='9' | 'a'..='f' | 'A'..='F' => { + has_digits = true; + cursor.bump(); + } + _ => break, + } + } + has_digits +} + +/// Eats the float exponent. Returns true if at least one digit was met, +/// and returns false otherwise. 
+fn eat_float_exponent(cursor: &mut Cursor) -> bool { + debug_assert!(cursor.prev() == 'e' || cursor.prev() == 'E'); + if cursor.first() == '-' || cursor.first() == '+' { + cursor.bump(); + } + eat_decimal_digits(cursor) +} + pub(crate) fn lifetime_or_char(cursor: &mut Cursor) -> TokenKind { debug_assert!(cursor.prev() == '\''); @@ -137,7 +181,7 @@ pub(crate) fn lifetime_or_char(cursor: &mut Cursor) -> TokenKind { let terminated = single_quoted_string(cursor); let suffix_start = cursor.len_consumed(); if terminated { - cursor.eat_literal_suffix(); + eat_literal_suffix(cursor); } let kind = LiteralKind::Char { terminated }; return TokenKind::Literal { kind, suffix_start }; @@ -307,3 +351,8 @@ fn raw_string_unvalidated(cursor: &mut Cursor, prefix_len: usize) -> (usize, Opt } } } + +/// Eats the suffix of a literal, e.g. "_u8". +pub(crate) fn eat_literal_suffix(cursor: &mut Cursor) { + cursor.eat_identifier(); +} From 70a4bc89c70d1cba8d6bd2f961f3156962efab7a Mon Sep 17 00:00:00 2001 From: Julian Wollersberger Date: Wed, 3 Mar 2021 23:06:03 +0100 Subject: [PATCH 3/6] Make `advance_token` a freestanding function. This has better separation of concerns between the lexing and the Cursor's iterator-like functionality. --- compiler/rustc_lexer/src/lib.rs | 372 +++++++++++++-------------- compiler/rustc_lexer/src/literals.rs | 4 +- 2 files changed, 187 insertions(+), 189 deletions(-) diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 9855cf13092fb..a05a053324519 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -168,7 +168,7 @@ pub fn strip_shebang(input: &str) -> Option<usize> { /// Parses the first token from the provided input string. pub fn first_token(input: &str) -> Token { debug_assert!(!input.is_empty()); - Cursor::new(input).advance_token() + advance_token(&mut Cursor::new(input)) } /// Creates an iterator that produces tokens from the input string. 
@@ -250,211 +250,209 @@ pub fn is_ident(string: &str) -> bool { } } -impl Cursor<'_> { - /// Parses a token from the input string. - fn advance_token(&mut self) -> Token { - let first_char = self.bump().unwrap(); - let token_kind = match first_char { - // Slash, comment or block comment. - '/' => match self.first() { - '/' => self.line_comment(), - '*' => self.block_comment(), - _ => Slash, - }, - - // Whitespace sequence. - c if is_whitespace(c) => self.whitespace(), - - // Raw identifier, raw string literal or identifier. - 'r' => match (self.first(), self.second()) { - ('#', c1) if is_id_start(c1) => self.raw_ident(), - ('#', _) | ('"', _) => { - let (n_hashes, err) = raw_double_quoted_string(self, 1); - let suffix_start = self.len_consumed(); - if err.is_none() { - eat_literal_suffix(self); - } - let kind = LiteralKind::RawStr { n_hashes, err }; - Literal { kind, suffix_start } +/// Parses a token from the input string. +fn advance_token(cursor: &mut Cursor) -> Token { + let first_char = cursor.bump().unwrap(); + let token_kind = match first_char { + // Slash, comment or block comment. + '/' => match cursor.first() { + '/' => line_comment(cursor), + '*' => block_comment(cursor), + _ => Slash, + }, + + // Whitespace sequence. + c if is_whitespace(c) => whitespace(cursor), + + // Raw identifier, raw string literal or identifier. + 'r' => match (cursor.first(), cursor.second()) { + ('#', c1) if is_id_start(c1) => raw_ident(cursor), + ('#', _) | ('"', _) => { + let (n_hashes, err) = raw_double_quoted_string(cursor, 1); + let suffix_start = cursor.len_consumed(); + if err.is_none() { + eat_literal_suffix(cursor); } - _ => self.ident(), - }, - - // Byte literal, byte string literal, raw byte string literal or identifier. 
- 'b' => match (self.first(), self.second()) { - ('\'', _) => { - self.bump(); - let terminated = single_quoted_string(self); - let suffix_start = self.len_consumed(); - if terminated { - eat_literal_suffix(self); - } - let kind = LiteralKind::Byte { terminated }; - Literal { kind, suffix_start } - } - ('"', _) => { - self.bump(); - let terminated = double_quoted_string(self); - let suffix_start = self.len_consumed(); - if terminated { - eat_literal_suffix(self); - } - let kind = LiteralKind::ByteStr { terminated }; - Literal { kind, suffix_start } - } - ('r', '"') | ('r', '#') => { - self.bump(); - let (n_hashes, err) = raw_double_quoted_string(self, 2); - let suffix_start = self.len_consumed(); - if err.is_none() { - eat_literal_suffix(self); - } - let kind = LiteralKind::RawByteStr { n_hashes, err }; - Literal { kind, suffix_start } + let kind = LiteralKind::RawStr { n_hashes, err }; + Literal { kind, suffix_start } + } + _ => ident(cursor), + }, + + // Byte literal, byte string literal, raw byte string literal or identifier. + 'b' => match (cursor.first(), cursor.second()) { + ('\'', _) => { + cursor.bump(); + let terminated = single_quoted_string(cursor); + let suffix_start = cursor.len_consumed(); + if terminated { + eat_literal_suffix(cursor); } - _ => self.ident(), - }, - - // Identifier (this should be checked after other variant that can - // start as identifier). - c if is_id_start(c) => self.ident(), - - // Numeric literal. - c @ '0'..='9' => { - let literal_kind = number(self, c); - let suffix_start = self.len_consumed(); - eat_literal_suffix(self); - TokenKind::Literal { kind: literal_kind, suffix_start } + let kind = LiteralKind::Byte { terminated }; + Literal { kind, suffix_start } } - - // One-symbol tokens. - ';' => Semi, - ',' => Comma, - '.' => Dot, - '(' => OpenParen, - ')' => CloseParen, - '{' => OpenBrace, - '}' => CloseBrace, - '[' => OpenBracket, - ']' => CloseBracket, - '@' => At, - '#' => Pound, - '~' => Tilde, - '?' 
=> Question, - ':' => Colon, - '$' => Dollar, - '=' => Eq, - '!' => Bang, - '<' => Lt, - '>' => Gt, - '-' => Minus, - '&' => And, - '|' => Or, - '+' => Plus, - '*' => Star, - '^' => Caret, - '%' => Percent, - - // Lifetime or character literal. - '\'' => lifetime_or_char(self), - - // String literal. - '"' => { - let terminated = double_quoted_string(self); - let suffix_start = self.len_consumed(); + ('"', _) => { + cursor.bump(); + let terminated = double_quoted_string(cursor); + let suffix_start = cursor.len_consumed(); if terminated { - eat_literal_suffix(self); + eat_literal_suffix(cursor); } - let kind = LiteralKind::Str { terminated }; + let kind = LiteralKind::ByteStr { terminated }; Literal { kind, suffix_start } } - _ => Unknown, - }; - Token::new(token_kind, self.len_consumed()) - } + ('r', '"') | ('r', '#') => { + cursor.bump(); + let (n_hashes, err) = raw_double_quoted_string(cursor, 2); + let suffix_start = cursor.len_consumed(); + if err.is_none() { + eat_literal_suffix(cursor); + } + let kind = LiteralKind::RawByteStr { n_hashes, err }; + Literal { kind, suffix_start } + } + _ => ident(cursor), + }, + + // Identifier (this should be checked after other variant that can + // start as identifier). + c if is_id_start(c) => ident(cursor), + + // Numeric literal. + c @ '0'..='9' => { + let literal_kind = number(cursor, c); + let suffix_start = cursor.len_consumed(); + eat_literal_suffix(cursor); + TokenKind::Literal { kind: literal_kind, suffix_start } + } + + // One-symbol tokens. + ';' => Semi, + ',' => Comma, + '.' => Dot, + '(' => OpenParen, + ')' => CloseParen, + '{' => OpenBrace, + '}' => CloseBrace, + '[' => OpenBracket, + ']' => CloseBracket, + '@' => At, + '#' => Pound, + '~' => Tilde, + '?' => Question, + ':' => Colon, + '$' => Dollar, + '=' => Eq, + '!' => Bang, + '<' => Lt, + '>' => Gt, + '-' => Minus, + '&' => And, + '|' => Or, + '+' => Plus, + '*' => Star, + '^' => Caret, + '%' => Percent, + + // Lifetime or character literal. 
+ '\'' => lifetime_or_char(cursor), + + // String literal. + '"' => { + let terminated = double_quoted_string(cursor); + let suffix_start = cursor.len_consumed(); + if terminated { + eat_literal_suffix(cursor); + } + let kind = LiteralKind::Str { terminated }; + Literal { kind, suffix_start } + } + _ => Unknown, + }; + Token::new(token_kind, cursor.len_consumed()) +} - fn line_comment(&mut self) -> TokenKind { - debug_assert!(self.prev() == '/' && self.first() == '/'); - self.bump(); +fn line_comment(cursor: &mut Cursor) -> TokenKind { + debug_assert!(cursor.prev() == '/' && cursor.first() == '/'); + cursor.bump(); - let doc_style = match self.first() { - // `//!` is an inner line doc comment. - '!' => Some(DocStyle::Inner), - // `////` (more than 3 slashes) is not considered a doc comment. - '/' if self.second() != '/' => Some(DocStyle::Outer), - _ => None, - }; + let doc_style = match cursor.first() { + // `//!` is an inner line doc comment. + '!' => Some(DocStyle::Inner), + // `////` (more than 3 slashes) is not considered a doc comment. + '/' if cursor.second() != '/' => Some(DocStyle::Outer), + _ => None, + }; - self.eat_while(|c| c != '\n'); - LineComment { doc_style } - } + cursor.eat_while(|c| c != '\n'); + LineComment { doc_style } +} - fn block_comment(&mut self) -> TokenKind { - debug_assert!(self.prev() == '/' && self.first() == '*'); - self.bump(); - - let doc_style = match self.first() { - // `/*!` is an inner block doc comment. - '!' => Some(DocStyle::Inner), - // `/***` (more than 2 stars) is not considered a doc comment. - // `/**/` is not considered a doc comment. 
- '*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer), - _ => None, - }; - - let mut depth = 1usize; - while let Some(c) = self.bump() { - match c { - '/' if self.first() == '*' => { - self.bump(); - depth += 1; - } - '*' if self.first() == '/' => { - self.bump(); - depth -= 1; - if depth == 0 { - // This block comment is closed, so for a construction like "/* */ */" - // there will be a successfully parsed block comment "/* */" - // and " */" will be processed separately. - break; - } +fn block_comment(cursor: &mut Cursor) -> TokenKind { + debug_assert!(cursor.prev() == '/' && cursor.first() == '*'); + cursor.bump(); + + let doc_style = match cursor.first() { + // `/*!` is an inner block doc comment. + '!' => Some(DocStyle::Inner), + // `/***` (more than 2 stars) is not considered a doc comment. + // `/**/` is not considered a doc comment. + '*' if !matches!(cursor.second(), '*' | '/') => Some(DocStyle::Outer), + _ => None, + }; + + let mut depth = 1usize; + while let Some(c) = cursor.bump() { + match c { + '/' if cursor.first() == '*' => { + cursor.bump(); + depth += 1; + } + '*' if cursor.first() == '/' => { + cursor.bump(); + depth -= 1; + if depth == 0 { + // This block comment is closed, so for a construction like "/* */ */" + // there will be a successfully parsed block comment "/* */" + // and " */" will be processed separately. + break; } - _ => (), } + _ => (), } - - BlockComment { doc_style, terminated: depth == 0 } } - fn whitespace(&mut self) -> TokenKind { - debug_assert!(is_whitespace(self.prev())); - self.eat_while(is_whitespace); - Whitespace - } + BlockComment { doc_style, terminated: depth == 0 } +} - fn raw_ident(&mut self) -> TokenKind { - debug_assert!(self.prev() == 'r' && self.first() == '#' && is_id_start(self.second())); - // Eat "#" symbol. - self.bump(); - // Eat the identifier part of RawIdent. 
- self.eat_identifier(); - RawIdent - } +fn whitespace(cursor: &mut Cursor) -> TokenKind { + debug_assert!(is_whitespace(cursor.prev())); + cursor.eat_while(is_whitespace); + Whitespace +} - fn ident(&mut self) -> TokenKind { - debug_assert!(is_id_start(self.prev())); - // Start is already eaten, eat the rest of identifier. - self.eat_while(is_id_continue); - Ident - } +fn raw_ident(cursor: &mut Cursor) -> TokenKind { + debug_assert!(cursor.prev() == 'r' && cursor.first() == '#' && is_id_start(cursor.second())); + // Eat "#" symbol. + cursor.bump(); + // Eat the identifier part of RawIdent. + eat_identifier(cursor); + RawIdent +} - /// Eats one identifier. - fn eat_identifier(&mut self) { - if !is_id_start(self.first()) { - return; - } - self.bump(); +fn ident(cursor: &mut Cursor) -> TokenKind { + debug_assert!(is_id_start(cursor.prev())); + // Start is already eaten, eat the rest of identifier. + cursor.eat_while(is_id_continue); + Ident +} - self.eat_while(is_id_continue); +/// Eats one identifier. +pub(crate) fn eat_identifier(cursor: &mut Cursor) { + if !is_id_start(cursor.first()) { + return; } + cursor.bump(); + + cursor.eat_while(is_id_continue); } diff --git a/compiler/rustc_lexer/src/literals.rs b/compiler/rustc_lexer/src/literals.rs index 401b4a5df653d..02adbd77cb3df 100644 --- a/compiler/rustc_lexer/src/literals.rs +++ b/compiler/rustc_lexer/src/literals.rs @@ -1,5 +1,5 @@ use crate::cursor::{Cursor, EOF_CHAR}; -use crate::{is_id_continue, is_id_start, TokenKind}; +use crate::{is_id_continue, is_id_start, TokenKind, eat_identifier}; use std::convert::TryFrom; #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] @@ -354,5 +354,5 @@ fn raw_string_unvalidated(cursor: &mut Cursor, prefix_len: usize) -> (usize, Opt /// Eats the suffix of a literal, e.g. "_u8". 
pub(crate) fn eat_literal_suffix(cursor: &mut Cursor) { - cursor.eat_identifier(); + eat_identifier(cursor); } From 629e161527a2edc0cbf2e2fa9cbe1f069309e230 Mon Sep 17 00:00:00 2001 From: Julian Wollersberger Date: Wed, 3 Mar 2021 23:37:50 +0100 Subject: [PATCH 4/6] Simplified `cursor.rs` a bit and renamed `first` to `peek` and `eat_while` to `bump_while`. --- compiler/rustc_lexer/src/cursor.rs | 28 +++++++------------ compiler/rustc_lexer/src/lib.rs | 34 +++++++++++------------ compiler/rustc_lexer/src/literals.rs | 40 ++++++++++++++-------------- 3 files changed, 46 insertions(+), 56 deletions(-) diff --git a/compiler/rustc_lexer/src/cursor.rs b/compiler/rustc_lexer/src/cursor.rs index e188102c8e580..5110d7a109aaa 100644 --- a/compiler/rustc_lexer/src/cursor.rs +++ b/compiler/rustc_lexer/src/cursor.rs @@ -2,7 +2,7 @@ use std::str::Chars; /// Peekable iterator over a char sequence. /// -/// Next characters can be peeked via `nth_char` method, +/// Next characters can be peeked via `peek` method, /// and position can be shifted forward via `bump` method. pub(crate) struct Cursor<'a> { initial_len: usize, @@ -37,22 +37,17 @@ impl<'a> Cursor<'a> { } } - /// Returns nth character relative to the current cursor position. - /// If requested position doesn't exist, `EOF_CHAR` is returned. + /// Peeks the next symbol from the input stream without consuming it. + /// If it doesn't exist, `EOF_CHAR` is returned. /// However, getting `EOF_CHAR` doesn't always mean actual end of file, /// it should be checked with `is_eof` method. - fn nth_char(&self, n: usize) -> char { - self.chars().nth(n).unwrap_or(EOF_CHAR) - } - - /// Peeks the next symbol from the input stream without consuming it. - pub(crate) fn first(&self) -> char { - self.nth_char(0) + pub(crate) fn peek(&self) -> char { + self.chars.clone().nth(0).unwrap_or(EOF_CHAR) } /// Peeks the second symbol from the input stream without consuming it. 
- pub(crate) fn second(&self) -> char { - self.nth_char(1) + pub(crate) fn peek_second(&self) -> char { + self.chars.clone().nth(1).unwrap_or(EOF_CHAR) } /// Checks if there is nothing more to consume. @@ -65,11 +60,6 @@ impl<'a> Cursor<'a> { self.initial_len - self.chars.as_str().len() } - /// Returns a `Chars` iterator over the remaining characters. - fn chars(&self) -> Chars<'a> { - self.chars.clone() - } - /// Moves to the next character. pub(crate) fn bump(&mut self) -> Option { let c = self.chars.next()?; @@ -83,8 +73,8 @@ impl<'a> Cursor<'a> { } /// Eats symbols while predicate returns true or until the end of file is reached. - pub(crate) fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { - while predicate(self.first()) && !self.is_eof() { + pub(crate) fn bump_while(&mut self, mut predicate: impl FnMut(char) -> bool) { + while predicate(self.peek()) && !self.is_eof() { self.bump(); } } diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index a05a053324519..2f934f16a7cd4 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -255,7 +255,7 @@ fn advance_token(cursor: &mut Cursor) -> Token { let first_char = cursor.bump().unwrap(); let token_kind = match first_char { // Slash, comment or block comment. - '/' => match cursor.first() { + '/' => match cursor.peek() { '/' => line_comment(cursor), '*' => block_comment(cursor), _ => Slash, @@ -265,7 +265,7 @@ fn advance_token(cursor: &mut Cursor) -> Token { c if is_whitespace(c) => whitespace(cursor), // Raw identifier, raw string literal or identifier. - 'r' => match (cursor.first(), cursor.second()) { + 'r' => match (cursor.peek(), cursor.peek_second()) { ('#', c1) if is_id_start(c1) => raw_ident(cursor), ('#', _) | ('"', _) => { let (n_hashes, err) = raw_double_quoted_string(cursor, 1); @@ -280,7 +280,7 @@ fn advance_token(cursor: &mut Cursor) -> Token { }, // Byte literal, byte string literal, raw byte string literal or identifier. 
- 'b' => match (cursor.first(), cursor.second()) { + 'b' => match (cursor.peek(), cursor.peek_second()) { ('\'', _) => { cursor.bump(); let terminated = single_quoted_string(cursor); @@ -373,42 +373,42 @@ fn advance_token(cursor: &mut Cursor) -> Token { } fn line_comment(cursor: &mut Cursor) -> TokenKind { - debug_assert!(cursor.prev() == '/' && cursor.first() == '/'); + debug_assert!(cursor.prev() == '/' && cursor.peek() == '/'); cursor.bump(); - let doc_style = match cursor.first() { + let doc_style = match cursor.peek() { // `//!` is an inner line doc comment. '!' => Some(DocStyle::Inner), // `////` (more than 3 slashes) is not considered a doc comment. - '/' if cursor.second() != '/' => Some(DocStyle::Outer), + '/' if cursor.peek_second() != '/' => Some(DocStyle::Outer), _ => None, }; - cursor.eat_while(|c| c != '\n'); + cursor.bump_while(|c| c != '\n'); LineComment { doc_style } } fn block_comment(cursor: &mut Cursor) -> TokenKind { - debug_assert!(cursor.prev() == '/' && cursor.first() == '*'); + debug_assert!(cursor.prev() == '/' && cursor.peek() == '*'); cursor.bump(); - let doc_style = match cursor.first() { + let doc_style = match cursor.peek() { // `/*!` is an inner block doc comment. '!' => Some(DocStyle::Inner), // `/***` (more than 2 stars) is not considered a doc comment. // `/**/` is not considered a doc comment. 
- '*' if !matches!(cursor.second(), '*' | '/') => Some(DocStyle::Outer), + '*' if !matches!(cursor.peek_second(), '*' | '/') => Some(DocStyle::Outer), _ => None, }; let mut depth = 1usize; while let Some(c) = cursor.bump() { match c { - '/' if cursor.first() == '*' => { + '/' if cursor.peek() == '*' => { cursor.bump(); depth += 1; } - '*' if cursor.first() == '/' => { + '*' if cursor.peek() == '/' => { cursor.bump(); depth -= 1; if depth == 0 { @@ -427,12 +427,12 @@ fn block_comment(cursor: &mut Cursor) -> TokenKind { fn whitespace(cursor: &mut Cursor) -> TokenKind { debug_assert!(is_whitespace(cursor.prev())); - cursor.eat_while(is_whitespace); + cursor.bump_while(is_whitespace); Whitespace } fn raw_ident(cursor: &mut Cursor) -> TokenKind { - debug_assert!(cursor.prev() == 'r' && cursor.first() == '#' && is_id_start(cursor.second())); + debug_assert!(cursor.prev() == 'r' && cursor.peek() == '#' && is_id_start(cursor.peek_second())); // Eat "#" symbol. cursor.bump(); // Eat the identifier part of RawIdent. @@ -443,16 +443,16 @@ fn raw_ident(cursor: &mut Cursor) -> TokenKind { fn ident(cursor: &mut Cursor) -> TokenKind { debug_assert!(is_id_start(cursor.prev())); // Start is already eaten, eat the rest of identifier. - cursor.eat_while(is_id_continue); + cursor.bump_while(is_id_continue); Ident } /// Eats one identifier. pub(crate) fn eat_identifier(cursor: &mut Cursor) { - if !is_id_start(cursor.first()) { + if !is_id_start(cursor.peek()) { return; } cursor.bump(); - cursor.eat_while(is_id_continue); + cursor.bump_while(is_id_continue); } diff --git a/compiler/rustc_lexer/src/literals.rs b/compiler/rustc_lexer/src/literals.rs index 02adbd77cb3df..daf231bceb39e 100644 --- a/compiler/rustc_lexer/src/literals.rs +++ b/compiler/rustc_lexer/src/literals.rs @@ -56,7 +56,7 @@ pub(crate) fn number(cursor: &mut Cursor, first_digit: char) -> LiteralKind { let mut base = Base::Decimal; if first_digit == '0' { // Attempt to parse encoding base. 
- let has_digits = match cursor.first() { + let has_digits = match cursor.peek() { 'b' => { base = Base::Binary; cursor.bump(); @@ -90,18 +90,18 @@ pub(crate) fn number(cursor: &mut Cursor, first_digit: char) -> LiteralKind { eat_decimal_digits(cursor); }; - match cursor.first() { + match cursor.peek() { // Don't be greedy if this is actually an // integer literal followed by field/method access or a range pattern // (`0..2` and `12.foo()`) - '.' if cursor.second() != '.' && !is_id_start(cursor.second()) => { + '.' if cursor.peek_second() != '.' && !is_id_start(cursor.peek_second()) => { // might have stuff after the ., and if it does, it needs to start // with a number cursor.bump(); let mut empty_exponent = false; - if cursor.first().is_digit(10) { + if cursor.peek().is_digit(10) { eat_decimal_digits(cursor); - match cursor.first() { + match cursor.peek() { 'e' | 'E' => { cursor.bump(); empty_exponent = !eat_float_exponent(cursor); @@ -123,7 +123,7 @@ pub(crate) fn number(cursor: &mut Cursor, first_digit: char) -> LiteralKind { pub(crate) fn eat_decimal_digits(cursor: &mut Cursor) -> bool { let mut has_digits = false; loop { - match cursor.first() { + match cursor.peek() { '_' => { cursor.bump(); } @@ -140,7 +140,7 @@ pub(crate) fn eat_decimal_digits(cursor: &mut Cursor) -> bool { pub(crate) fn eat_hexadecimal_digits(cursor: &mut Cursor) -> bool { let mut has_digits = false; loop { - match cursor.first() { + match cursor.peek() { '_' => { cursor.bump(); } @@ -158,7 +158,7 @@ pub(crate) fn eat_hexadecimal_digits(cursor: &mut Cursor) -> bool { /// and returns false otherwise. 
fn eat_float_exponent(cursor: &mut Cursor) -> bool { debug_assert!(cursor.prev() == 'e' || cursor.prev() == 'E'); - if cursor.first() == '-' || cursor.first() == '+' { + if cursor.peek() == '-' || cursor.peek() == '+' { cursor.bump(); } eat_decimal_digits(cursor) @@ -167,14 +167,14 @@ fn eat_float_exponent(cursor: &mut Cursor) -> bool { pub(crate) fn lifetime_or_char(cursor: &mut Cursor) -> TokenKind { debug_assert!(cursor.prev() == '\''); - let can_be_a_lifetime = if cursor.second() == '\'' { + let can_be_a_lifetime = if cursor.peek_second() == '\'' { // It's surely not a lifetime. false } else { // If the first symbol is valid for identifier, it can be a lifetime. // Also check if it's a number for a better error reporting (so '0 will // be reported as invalid lifetime and not as unterminated char literal). - is_id_start(cursor.first()) || cursor.first().is_digit(10) + is_id_start(cursor.peek()) || cursor.peek().is_digit(10) }; if !can_be_a_lifetime { @@ -190,18 +190,18 @@ pub(crate) fn lifetime_or_char(cursor: &mut Cursor) -> TokenKind { // Either a lifetime or a character literal with // length greater than 1. - let starts_with_number = cursor.first().is_digit(10); + let starts_with_number = cursor.peek().is_digit(10); // Skip the literal contents. // First symbol can be a number (which isn't a valid identifier start), // so skip it without any checks. cursor.bump(); - cursor.eat_while(is_id_continue); + cursor.bump_while(is_id_continue); // Check if after skipping literal contents we've met a closing // single quote (which means that user attempted to create a // string with single quotes). 
- if cursor.first() == '\'' { + if cursor.peek() == '\'' { cursor.bump(); let kind = LiteralKind::Char { terminated: true }; TokenKind::Literal { kind, suffix_start: cursor.len_consumed() } @@ -213,7 +213,7 @@ pub(crate) fn lifetime_or_char(cursor: &mut Cursor) -> TokenKind { pub(crate) fn single_quoted_string(cursor: &mut Cursor) -> bool { debug_assert!(cursor.prev() == '\''); // Check if it's a one-symbol literal. - if cursor.second() == '\'' && cursor.first() != '\\' { + if cursor.peek_second() == '\'' && cursor.peek() != '\\' { cursor.bump(); cursor.bump(); return true; @@ -223,7 +223,7 @@ pub(crate) fn single_quoted_string(cursor: &mut Cursor) -> bool { // Parse until either quotes are terminated or error is detected. loop { - match cursor.first() { + match cursor.peek() { // Quotes are terminated, finish parsing. '\'' => { cursor.bump(); @@ -233,7 +233,7 @@ pub(crate) fn single_quoted_string(cursor: &mut Cursor) -> bool { // to the error report. '/' => break, // Newline without following '\'' means unclosed quote, stop parsing. - '\n' if cursor.second() != '\'' => break, + '\n' if cursor.peek_second() != '\'' => break, // End of file, stop parsing. EOF_CHAR if cursor.is_eof() => break, // Escaped slash is considered one character, so bump twice. @@ -260,7 +260,7 @@ pub(crate) fn double_quoted_string(cursor: &mut Cursor) -> bool { '"' => { return true; } - '\\' if cursor.first() == '\\' || cursor.first() == '"' => { + '\\' if cursor.peek() == '\\' || cursor.peek() == '"' => { // Bump again to skip escaped character. cursor.bump(); } @@ -295,7 +295,7 @@ fn raw_string_unvalidated(cursor: &mut Cursor, prefix_len: usize) -> (usize, Opt // Count opening '#' symbols. 
let mut eaten = 0; - while cursor.first() == '#' { + while cursor.peek() == '#' { eaten += 1; cursor.bump(); } @@ -313,7 +313,7 @@ fn raw_string_unvalidated(cursor: &mut Cursor, prefix_len: usize) -> (usize, Opt // Skip the string contents and on each '#' character met, check if this is // a raw string termination. loop { - cursor.eat_while(|c| c != '"'); + cursor.bump_while(|c| c != '"'); if cursor.is_eof() { return ( @@ -335,7 +335,7 @@ fn raw_string_unvalidated(cursor: &mut Cursor, prefix_len: usize) -> (usize, Opt // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }` // followed by a `#` token. let mut n_end_hashes = 0; - while cursor.first() == '#' && n_end_hashes < n_start_hashes { + while cursor.peek() == '#' && n_end_hashes < n_start_hashes { n_end_hashes += 1; cursor.bump(); } From d4336a52a2b18a844c844aab5f54efff60053502 Mon Sep 17 00:00:00 2001 From: Julian Wollersberger Date: Thu, 4 Mar 2021 10:14:11 +0100 Subject: [PATCH 5/6] Inline some helper functions that are only used once, to lower the number of things I need to keep in my head. And fix imports in tests.rs. --- compiler/rustc_lexer/src/lib.rs | 60 ++++++++++------------------ compiler/rustc_lexer/src/literals.rs | 11 +++-- compiler/rustc_lexer/src/tests.rs | 9 +++-- 3 files changed, 35 insertions(+), 45 deletions(-) diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 2f934f16a7cd4..9ad70ce8b7623 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -28,11 +28,13 @@ pub mod unescape; #[cfg(test)] mod tests; +pub use crate::literals::{Base, LiteralKind, RawStrError}; + use self::TokenKind::*; use crate::cursor::Cursor; use crate::literals::{ double_quoted_string, eat_literal_suffix, lifetime_or_char, number, raw_double_quoted_string, - single_quoted_string, LiteralKind, + single_quoted_string, }; /// Parsed token. 
@@ -165,12 +167,6 @@ pub fn strip_shebang(input: &str) -> Option { None } -/// Parses the first token from the provided input string. -pub fn first_token(input: &str) -> Token { - debug_assert!(!input.is_empty()); - advance_token(&mut Cursor::new(input)) -} - /// Creates an iterator that produces tokens from the input string. pub fn tokenize(mut input: &str) -> impl Iterator + '_ { std::iter::from_fn(move || { @@ -250,8 +246,11 @@ pub fn is_ident(string: &str) -> bool { } } -/// Parses a token from the input string. -fn advance_token(cursor: &mut Cursor) -> Token { +/// Parses the first token from the provided input string. +pub fn first_token(input: &str) -> Token { + debug_assert!(!input.is_empty()); + let cursor = &mut Cursor::new(input); + let first_char = cursor.bump().unwrap(); let token_kind = match first_char { // Slash, comment or block comment. @@ -262,11 +261,21 @@ fn advance_token(cursor: &mut Cursor) -> Token { }, // Whitespace sequence. - c if is_whitespace(c) => whitespace(cursor), + c if is_whitespace(c) => { + cursor.bump_while(is_whitespace); + Whitespace + } // Raw identifier, raw string literal or identifier. 'r' => match (cursor.peek(), cursor.peek_second()) { - ('#', c1) if is_id_start(c1) => raw_ident(cursor), + ('#', c1) if is_id_start(c1) => { + // Eat "#" symbol. + cursor.bump(); + // Eat the identifier part of RawIdent. + cursor.bump(); + ident(cursor); + RawIdent + } ('#', _) | ('"', _) => { let (n_hashes, err) = raw_double_quoted_string(cursor, 1); let suffix_start = cursor.len_consumed(); @@ -425,34 +434,9 @@ fn block_comment(cursor: &mut Cursor) -> TokenKind { BlockComment { doc_style, terminated: depth == 0 } } -fn whitespace(cursor: &mut Cursor) -> TokenKind { - debug_assert!(is_whitespace(cursor.prev())); - cursor.bump_while(is_whitespace); - Whitespace -} - -fn raw_ident(cursor: &mut Cursor) -> TokenKind { - debug_assert!(cursor.prev() == 'r' && cursor.peek() == '#' && is_id_start(cursor.peek_second())); - // Eat "#" symbol. 
- cursor.bump(); - // Eat the identifier part of RawIdent. - eat_identifier(cursor); - RawIdent -} - -fn ident(cursor: &mut Cursor) -> TokenKind { +/// Start is already eaten, eat the rest of identifier. +pub(crate) fn ident(cursor: &mut Cursor) -> TokenKind { debug_assert!(is_id_start(cursor.prev())); - // Start is already eaten, eat the rest of identifier. cursor.bump_while(is_id_continue); Ident } - -/// Eats one identifier. -pub(crate) fn eat_identifier(cursor: &mut Cursor) { - if !is_id_start(cursor.peek()) { - return; - } - cursor.bump(); - - cursor.bump_while(is_id_continue); -} diff --git a/compiler/rustc_lexer/src/literals.rs b/compiler/rustc_lexer/src/literals.rs index daf231bceb39e..5aa0bac31681e 100644 --- a/compiler/rustc_lexer/src/literals.rs +++ b/compiler/rustc_lexer/src/literals.rs @@ -1,5 +1,5 @@ use crate::cursor::{Cursor, EOF_CHAR}; -use crate::{is_id_continue, is_id_start, TokenKind, eat_identifier}; +use crate::{ident, is_id_continue, is_id_start, TokenKind}; use std::convert::TryFrom; #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] @@ -279,7 +279,8 @@ pub(crate) fn raw_double_quoted_string( // Wrap the actual function to handle the error with too many hashes. // This way, it eats the whole raw string. let (n_hashes, err) = raw_string_unvalidated(cursor, prefix_len); - // Only up to 65535 `#`s are allowed in raw strings + + // Only up to 65535 `#`s are allowed in raw strings. match u16::try_from(n_hashes) { Ok(num) => (num, err), // We lie about the number of hashes here :P @@ -354,5 +355,9 @@ fn raw_string_unvalidated(cursor: &mut Cursor, prefix_len: usize) -> (usize, Opt /// Eats the suffix of a literal, e.g. "_u8". pub(crate) fn eat_literal_suffix(cursor: &mut Cursor) { - eat_identifier(cursor); + // Eats one identifier. 
+ if is_id_start(cursor.peek()) { + cursor.bump(); + ident(cursor); + } } diff --git a/compiler/rustc_lexer/src/tests.rs b/compiler/rustc_lexer/src/tests.rs index 94017b7b286e2..8f1a968526609 100644 --- a/compiler/rustc_lexer/src/tests.rs +++ b/compiler/rustc_lexer/src/tests.rs @@ -1,12 +1,13 @@ -use super::*; - +use crate::cursor::Cursor; +use crate::literals::{raw_double_quoted_string, RawStrError}; +use crate::{strip_shebang, tokenize}; use expect_test::{expect, Expect}; fn check_raw_str(s: &str, expected_hashes: u16, expected_err: Option) { let s = &format!("r{}", s); - let mut cursor = Cursor::new(s); + let cursor = &mut Cursor::new(s); cursor.bump(); - let (n_hashes, err) = cursor.raw_double_quoted_string(0); + let (n_hashes, err) = raw_double_quoted_string(cursor, 0); assert_eq!(n_hashes, expected_hashes); assert_eq!(err, expected_err); } From 7149a21c54730d21185fecf7a5cd324c7b555056 Mon Sep 17 00:00:00 2001 From: Julian Wollersberger Date: Thu, 4 Mar 2021 11:13:17 +0100 Subject: [PATCH 6/6] Address the "Hidden lifetime in path" warning. This one wasn't shown with `cargo check`. 
--- compiler/rustc_lexer/src/lib.rs | 6 +++--- compiler/rustc_lexer/src/literals.rs | 23 +++++++++++++---------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 9ad70ce8b7623..dacda8b3ca7a0 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -381,7 +381,7 @@ pub fn first_token(input: &str) -> Token { Token::new(token_kind, cursor.len_consumed()) } -fn line_comment(cursor: &mut Cursor) -> TokenKind { +fn line_comment(cursor: &mut Cursor<'_>) -> TokenKind { debug_assert!(cursor.prev() == '/' && cursor.peek() == '/'); cursor.bump(); @@ -397,7 +397,7 @@ fn line_comment(cursor: &mut Cursor) -> TokenKind { LineComment { doc_style } } -fn block_comment(cursor: &mut Cursor) -> TokenKind { +fn block_comment(cursor: &mut Cursor<'_>) -> TokenKind { debug_assert!(cursor.prev() == '/' && cursor.peek() == '*'); cursor.bump(); @@ -435,7 +435,7 @@ fn block_comment(cursor: &mut Cursor) -> TokenKind { } /// Start is already eaten, eat the rest of identifier. 
-pub(crate) fn ident(cursor: &mut Cursor) -> TokenKind { +pub(crate) fn ident(cursor: &mut Cursor<'_>) -> TokenKind { debug_assert!(is_id_start(cursor.prev())); cursor.bump_while(is_id_continue); Ident diff --git a/compiler/rustc_lexer/src/literals.rs b/compiler/rustc_lexer/src/literals.rs index 5aa0bac31681e..33792622bec57 100644 --- a/compiler/rustc_lexer/src/literals.rs +++ b/compiler/rustc_lexer/src/literals.rs @@ -51,7 +51,7 @@ pub enum RawStrError { TooManyDelimiters { found: usize }, } -pub(crate) fn number(cursor: &mut Cursor, first_digit: char) -> LiteralKind { +pub(crate) fn number(cursor: &mut Cursor<'_>, first_digit: char) -> LiteralKind { debug_assert!('0' <= cursor.prev() && cursor.prev() <= '9'); let mut base = Base::Decimal; if first_digit == '0' { @@ -120,7 +120,7 @@ pub(crate) fn number(cursor: &mut Cursor, first_digit: char) -> LiteralKind { } } -pub(crate) fn eat_decimal_digits(cursor: &mut Cursor) -> bool { +pub(crate) fn eat_decimal_digits(cursor: &mut Cursor<'_>) -> bool { let mut has_digits = false; loop { match cursor.peek() { @@ -137,7 +137,7 @@ pub(crate) fn eat_decimal_digits(cursor: &mut Cursor) -> bool { has_digits } -pub(crate) fn eat_hexadecimal_digits(cursor: &mut Cursor) -> bool { +pub(crate) fn eat_hexadecimal_digits(cursor: &mut Cursor<'_>) -> bool { let mut has_digits = false; loop { match cursor.peek() { @@ -156,7 +156,7 @@ pub(crate) fn eat_hexadecimal_digits(cursor: &mut Cursor) -> bool { /// Eats the float exponent. Returns true if at least one digit was met, /// and returns false otherwise. 
-fn eat_float_exponent(cursor: &mut Cursor) -> bool { +fn eat_float_exponent(cursor: &mut Cursor<'_>) -> bool { debug_assert!(cursor.prev() == 'e' || cursor.prev() == 'E'); if cursor.peek() == '-' || cursor.peek() == '+' { cursor.bump(); @@ -164,7 +164,7 @@ fn eat_float_exponent(cursor: &mut Cursor) -> bool { eat_decimal_digits(cursor) } -pub(crate) fn lifetime_or_char(cursor: &mut Cursor) -> TokenKind { +pub(crate) fn lifetime_or_char(cursor: &mut Cursor<'_>) -> TokenKind { debug_assert!(cursor.prev() == '\''); let can_be_a_lifetime = if cursor.peek_second() == '\'' { @@ -210,7 +210,7 @@ pub(crate) fn lifetime_or_char(cursor: &mut Cursor) -> TokenKind { } } -pub(crate) fn single_quoted_string(cursor: &mut Cursor) -> bool { +pub(crate) fn single_quoted_string(cursor: &mut Cursor<'_>) -> bool { debug_assert!(cursor.prev() == '\''); // Check if it's a one-symbol literal. if cursor.peek_second() == '\'' && cursor.peek() != '\\' { @@ -253,7 +253,7 @@ pub(crate) fn single_quoted_string(cursor: &mut Cursor) -> bool { /// Eats double-quoted string and returns true /// if string is terminated. -pub(crate) fn double_quoted_string(cursor: &mut Cursor) -> bool { +pub(crate) fn double_quoted_string(cursor: &mut Cursor<'_>) -> bool { debug_assert!(cursor.prev() == '"'); while let Some(c) = cursor.bump() { match c { @@ -273,7 +273,7 @@ pub(crate) fn double_quoted_string(cursor: &mut Cursor) -> bool { /// Eats the double-quoted string and returns `n_hashes` and an error if encountered. pub(crate) fn raw_double_quoted_string( - cursor: &mut Cursor, + cursor: &mut Cursor<'_>, prefix_len: usize, ) -> (u16, Option) { // Wrap the actual function to handle the error with too many hashes. 
@@ -288,7 +288,10 @@ pub(crate) fn raw_double_quoted_string( } } -fn raw_string_unvalidated(cursor: &mut Cursor, prefix_len: usize) -> (usize, Option) { +fn raw_string_unvalidated( + cursor: &mut Cursor<'_>, + prefix_len: usize, +) -> (usize, Option) { debug_assert!(cursor.prev() == 'r'); let start_pos = cursor.len_consumed(); let mut possible_terminator_offset = None; @@ -354,7 +357,7 @@ fn raw_string_unvalidated(cursor: &mut Cursor, prefix_len: usize) -> (usize, Opt } /// Eats the suffix of a literal, e.g. "_u8". -pub(crate) fn eat_literal_suffix(cursor: &mut Cursor) { +pub(crate) fn eat_literal_suffix(cursor: &mut Cursor<'_>) { // Eats one identifier. if is_id_start(cursor.peek()) { cursor.bump();