diff --git a/compiler/rustc_lexer/src/cursor.rs b/compiler/rustc_lexer/src/cursor.rs index 297f3d19ca178..5110d7a109aaa 100644 --- a/compiler/rustc_lexer/src/cursor.rs +++ b/compiler/rustc_lexer/src/cursor.rs @@ -2,7 +2,7 @@ use std::str::Chars; /// Peekable iterator over a char sequence. /// -/// Next characters can be peeked via `nth_char` method, +/// Next characters can be peeked via `peek` method, /// and position can be shifted forward via `bump` method. pub(crate) struct Cursor<'a> { initial_len: usize, @@ -37,22 +37,17 @@ impl<'a> Cursor<'a> { } } - /// Returns nth character relative to the current cursor position. - /// If requested position doesn't exist, `EOF_CHAR` is returned. + /// Peeks the next symbol from the input stream without consuming it. + /// If it doesn't exist, `EOF_CHAR` is returned. /// However, getting `EOF_CHAR` doesn't always mean actual end of file, /// it should be checked with `is_eof` method. - fn nth_char(&self, n: usize) -> char { - self.chars().nth(n).unwrap_or(EOF_CHAR) - } - - /// Peeks the next symbol from the input stream without consuming it. - pub(crate) fn first(&self) -> char { - self.nth_char(0) + pub(crate) fn peek(&self) -> char { + self.chars.clone().nth(0).unwrap_or(EOF_CHAR) } /// Peeks the second symbol from the input stream without consuming it. - pub(crate) fn second(&self) -> char { - self.nth_char(1) + pub(crate) fn peek_second(&self) -> char { + self.chars.clone().nth(1).unwrap_or(EOF_CHAR) } /// Checks if there is nothing more to consume. @@ -65,11 +60,6 @@ impl<'a> Cursor<'a> { self.initial_len - self.chars.as_str().len() } - /// Returns a `Chars` iterator over the remaining characters. - fn chars(&self) -> Chars<'a> { - self.chars.clone() - } - /// Moves to the next character. pub(crate) fn bump(&mut self) -> Option { let c = self.chars.next()?; @@ -81,4 +71,11 @@ impl<'a> Cursor<'a> { Some(c) } + + /// Eats symbols while predicate returns true or until the end of file is reached. 
+ pub(crate) fn bump_while(&mut self, mut predicate: impl FnMut(char) -> bool) { + while predicate(self.peek()) && !self.is_eof() { + self.bump(); + } + } } diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index 44fc4db7dc199..dacda8b3ca7a0 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -22,15 +22,20 @@ // `#![feature]` attributes should be added. mod cursor; +mod literals; pub mod unescape; #[cfg(test)] mod tests; -use self::LiteralKind::*; +pub use crate::literals::{Base, LiteralKind, RawStrError}; + use self::TokenKind::*; -use crate::cursor::{Cursor, EOF_CHAR}; -use std::convert::TryFrom; +use crate::cursor::Cursor; +use crate::literals::{ + double_quoted_string, eat_literal_suffix, lifetime_or_char, number, raw_double_quoted_string, + single_quoted_string, +}; /// Parsed token. /// It doesn't contain information about data that has been parsed, @@ -137,55 +142,6 @@ pub enum DocStyle { Inner, } -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] -pub enum LiteralKind { - /// "12_u8", "0o100", "0b120i99" - Int { base: Base, empty_int: bool }, - /// "12.34f32", "0b100.100" - Float { base: Base, empty_exponent: bool }, - /// "'a'", "'\\'", "'''", "';" - Char { terminated: bool }, - /// "b'a'", "b'\\'", "b'''", "b';" - Byte { terminated: bool }, - /// ""abc"", ""abc" - Str { terminated: bool }, - /// "b"abc"", "b"abc" - ByteStr { terminated: bool }, - /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a" - RawStr { n_hashes: u16, err: Option }, - /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a" - RawByteStr { n_hashes: u16, err: Option }, -} - -/// Error produced validating a raw string. Represents cases like: -/// - `r##~"abcde"##`: `InvalidStarter` -/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)` -/// - Too many `#`s (>65535): `TooManyDelimiters` -// perf note: It doesn't matter that this makes `Token` 36 bytes bigger. 
See #77629 -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] -pub enum RawStrError { - /// Non `#` characters exist between `r` and `"` eg. `r#~"..` - InvalidStarter { bad_char: char }, - /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they - /// may have intended to terminate it. - NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option }, - /// More than 65535 `#`s exist. - TooManyDelimiters { found: usize }, -} - -/// Base of numeric literal encoding according to its prefix. -#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] -pub enum Base { - /// Literal starts with "0b". - Binary, - /// Literal starts with "0o". - Octal, - /// Literal starts with "0x". - Hexadecimal, - /// Literal doesn't contain a prefix. - Decimal, -} - /// `rustc` allows files to have a shebang, e.g. "#!/usr/bin/rustrun", /// but shebang isn't a part of rust syntax. pub fn strip_shebang(input: &str) -> Option { @@ -211,12 +167,6 @@ pub fn strip_shebang(input: &str) -> Option { None } -/// Parses the first token from the provided input string. -pub fn first_token(input: &str) -> Token { - debug_assert!(!input.is_empty()); - Cursor::new(input).advance_token() -} - /// Creates an iterator that produces tokens from the input string. pub fn tokenize(mut input: &str) -> impl Iterator + '_ { std::iter::from_fn(move || { @@ -296,521 +246,197 @@ pub fn is_ident(string: &str) -> bool { } } -impl Cursor<'_> { - /// Parses a token from the input string. - fn advance_token(&mut self) -> Token { - let first_char = self.bump().unwrap(); - let token_kind = match first_char { - // Slash, comment or block comment. - '/' => match self.first() { - '/' => self.line_comment(), - '*' => self.block_comment(), - _ => Slash, - }, - - // Whitespace sequence. - c if is_whitespace(c) => self.whitespace(), - - // Raw identifier, raw string literal or identifier. 
- 'r' => match (self.first(), self.second()) { - ('#', c1) if is_id_start(c1) => self.raw_ident(), - ('#', _) | ('"', _) => { - let (n_hashes, err) = self.raw_double_quoted_string(1); - let suffix_start = self.len_consumed(); - if err.is_none() { - self.eat_literal_suffix(); - } - let kind = RawStr { n_hashes, err }; - Literal { kind, suffix_start } - } - _ => self.ident(), - }, - - // Byte literal, byte string literal, raw byte string literal or identifier. - 'b' => match (self.first(), self.second()) { - ('\'', _) => { - self.bump(); - let terminated = self.single_quoted_string(); - let suffix_start = self.len_consumed(); - if terminated { - self.eat_literal_suffix(); - } - let kind = Byte { terminated }; - Literal { kind, suffix_start } - } - ('"', _) => { - self.bump(); - let terminated = self.double_quoted_string(); - let suffix_start = self.len_consumed(); - if terminated { - self.eat_literal_suffix(); - } - let kind = ByteStr { terminated }; - Literal { kind, suffix_start } - } - ('r', '"') | ('r', '#') => { - self.bump(); - let (n_hashes, err) = self.raw_double_quoted_string(2); - let suffix_start = self.len_consumed(); - if err.is_none() { - self.eat_literal_suffix(); - } - let kind = RawByteStr { n_hashes, err }; - Literal { kind, suffix_start } - } - _ => self.ident(), - }, - - // Identifier (this should be checked after other variant that can - // start as identifier). - c if is_id_start(c) => self.ident(), - - // Numeric literal. - c @ '0'..='9' => { - let literal_kind = self.number(c); - let suffix_start = self.len_consumed(); - self.eat_literal_suffix(); - TokenKind::Literal { kind: literal_kind, suffix_start } - } +/// Parses the first token from the provided input string. +pub fn first_token(input: &str) -> Token { + debug_assert!(!input.is_empty()); + let cursor = &mut Cursor::new(input); + + let first_char = cursor.bump().unwrap(); + let token_kind = match first_char { + // Slash, comment or block comment. 
+ '/' => match cursor.peek() { + '/' => line_comment(cursor), + '*' => block_comment(cursor), + _ => Slash, + }, + + // Whitespace sequence. + c if is_whitespace(c) => { + cursor.bump_while(is_whitespace); + Whitespace + } - // One-symbol tokens. - ';' => Semi, - ',' => Comma, - '.' => Dot, - '(' => OpenParen, - ')' => CloseParen, - '{' => OpenBrace, - '}' => CloseBrace, - '[' => OpenBracket, - ']' => CloseBracket, - '@' => At, - '#' => Pound, - '~' => Tilde, - '?' => Question, - ':' => Colon, - '$' => Dollar, - '=' => Eq, - '!' => Bang, - '<' => Lt, - '>' => Gt, - '-' => Minus, - '&' => And, - '|' => Or, - '+' => Plus, - '*' => Star, - '^' => Caret, - '%' => Percent, - - // Lifetime or character literal. - '\'' => self.lifetime_or_char(), - - // String literal. - '"' => { - let terminated = self.double_quoted_string(); - let suffix_start = self.len_consumed(); - if terminated { - self.eat_literal_suffix(); + // Raw identifier, raw string literal or identifier. + 'r' => match (cursor.peek(), cursor.peek_second()) { + ('#', c1) if is_id_start(c1) => { + // Eat "#" symbol. + cursor.bump(); + // Eat the identifier part of RawIdent. + cursor.bump(); + ident(cursor); + RawIdent + } + ('#', _) | ('"', _) => { + let (n_hashes, err) = raw_double_quoted_string(cursor, 1); + let suffix_start = cursor.len_consumed(); + if err.is_none() { + eat_literal_suffix(cursor); } - let kind = Str { terminated }; + let kind = LiteralKind::RawStr { n_hashes, err }; Literal { kind, suffix_start } } - _ => Unknown, - }; - Token::new(token_kind, self.len_consumed()) - } - - fn line_comment(&mut self) -> TokenKind { - debug_assert!(self.prev() == '/' && self.first() == '/'); - self.bump(); - - let doc_style = match self.first() { - // `//!` is an inner line doc comment. - '!' => Some(DocStyle::Inner), - // `////` (more than 3 slashes) is not considered a doc comment. 
- '/' if self.second() != '/' => Some(DocStyle::Outer), - _ => None, - }; - - self.eat_while(|c| c != '\n'); - LineComment { doc_style } - } - - fn block_comment(&mut self) -> TokenKind { - debug_assert!(self.prev() == '/' && self.first() == '*'); - self.bump(); - - let doc_style = match self.first() { - // `/*!` is an inner block doc comment. - '!' => Some(DocStyle::Inner), - // `/***` (more than 2 stars) is not considered a doc comment. - // `/**/` is not considered a doc comment. - '*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer), - _ => None, - }; - - let mut depth = 1usize; - while let Some(c) = self.bump() { - match c { - '/' if self.first() == '*' => { - self.bump(); - depth += 1; - } - '*' if self.first() == '/' => { - self.bump(); - depth -= 1; - if depth == 0 { - // This block comment is closed, so for a construction like "/* */ */" - // there will be a successfully parsed block comment "/* */" - // and " */" will be processed separately. - break; - } + _ => ident(cursor), + }, + + // Byte literal, byte string literal, raw byte string literal or identifier. + 'b' => match (cursor.peek(), cursor.peek_second()) { + ('\'', _) => { + cursor.bump(); + let terminated = single_quoted_string(cursor); + let suffix_start = cursor.len_consumed(); + if terminated { + eat_literal_suffix(cursor); } - _ => (), + let kind = LiteralKind::Byte { terminated }; + Literal { kind, suffix_start } } - } - - BlockComment { doc_style, terminated: depth == 0 } - } - - fn whitespace(&mut self) -> TokenKind { - debug_assert!(is_whitespace(self.prev())); - self.eat_while(is_whitespace); - Whitespace - } - - fn raw_ident(&mut self) -> TokenKind { - debug_assert!(self.prev() == 'r' && self.first() == '#' && is_id_start(self.second())); - // Eat "#" symbol. - self.bump(); - // Eat the identifier part of RawIdent. 
- self.eat_identifier(); - RawIdent - } - - fn ident(&mut self) -> TokenKind { - debug_assert!(is_id_start(self.prev())); - // Start is already eaten, eat the rest of identifier. - self.eat_while(is_id_continue); - Ident - } - - fn number(&mut self, first_digit: char) -> LiteralKind { - debug_assert!('0' <= self.prev() && self.prev() <= '9'); - let mut base = Base::Decimal; - if first_digit == '0' { - // Attempt to parse encoding base. - let has_digits = match self.first() { - 'b' => { - base = Base::Binary; - self.bump(); - self.eat_decimal_digits() - } - 'o' => { - base = Base::Octal; - self.bump(); - self.eat_decimal_digits() - } - 'x' => { - base = Base::Hexadecimal; - self.bump(); - self.eat_hexadecimal_digits() - } - // Not a base prefix. - '0'..='9' | '_' | '.' | 'e' | 'E' => { - self.eat_decimal_digits(); - true + ('"', _) => { + cursor.bump(); + let terminated = double_quoted_string(cursor); + let suffix_start = cursor.len_consumed(); + if terminated { + eat_literal_suffix(cursor); } - // Just a 0. - _ => return Int { base, empty_int: false }, - }; - // Base prefix was provided, but there were no digits - // after it, e.g. "0x". - if !has_digits { - return Int { base, empty_int: true }; + let kind = LiteralKind::ByteStr { terminated }; + Literal { kind, suffix_start } } - } else { - // No base prefix, parse number in the usual way. - self.eat_decimal_digits(); - }; - - match self.first() { - // Don't be greedy if this is actually an - // integer literal followed by field/method access or a range pattern - // (`0..2` and `12.foo()`) - '.' if self.second() != '.' 
&& !is_id_start(self.second()) => { - // might have stuff after the ., and if it does, it needs to start - // with a number - self.bump(); - let mut empty_exponent = false; - if self.first().is_digit(10) { - self.eat_decimal_digits(); - match self.first() { - 'e' | 'E' => { - self.bump(); - empty_exponent = !self.eat_float_exponent(); - } - _ => (), - } + ('r', '"') | ('r', '#') => { + cursor.bump(); + let (n_hashes, err) = raw_double_quoted_string(cursor, 2); + let suffix_start = cursor.len_consumed(); + if err.is_none() { + eat_literal_suffix(cursor); } - Float { base, empty_exponent } - } - 'e' | 'E' => { - self.bump(); - let empty_exponent = !self.eat_float_exponent(); - Float { base, empty_exponent } + let kind = LiteralKind::RawByteStr { n_hashes, err }; + Literal { kind, suffix_start } } - _ => Int { base, empty_int: false }, + _ => ident(cursor), + }, + + // Identifier (this should be checked after other variant that can + // start as identifier). + c if is_id_start(c) => ident(cursor), + + // Numeric literal. + c @ '0'..='9' => { + let literal_kind = number(cursor, c); + let suffix_start = cursor.len_consumed(); + eat_literal_suffix(cursor); + TokenKind::Literal { kind: literal_kind, suffix_start } } - } - fn lifetime_or_char(&mut self) -> TokenKind { - debug_assert!(self.prev() == '\''); - - let can_be_a_lifetime = if self.second() == '\'' { - // It's surely not a lifetime. - false - } else { - // If the first symbol is valid for identifier, it can be a lifetime. - // Also check if it's a number for a better error reporting (so '0 will - // be reported as invalid lifetime and not as unterminated char literal). - is_id_start(self.first()) || self.first().is_digit(10) - }; - - if !can_be_a_lifetime { - let terminated = self.single_quoted_string(); - let suffix_start = self.len_consumed(); + // One-symbol tokens. + ';' => Semi, + ',' => Comma, + '.' 
=> Dot, + '(' => OpenParen, + ')' => CloseParen, + '{' => OpenBrace, + '}' => CloseBrace, + '[' => OpenBracket, + ']' => CloseBracket, + '@' => At, + '#' => Pound, + '~' => Tilde, + '?' => Question, + ':' => Colon, + '$' => Dollar, + '=' => Eq, + '!' => Bang, + '<' => Lt, + '>' => Gt, + '-' => Minus, + '&' => And, + '|' => Or, + '+' => Plus, + '*' => Star, + '^' => Caret, + '%' => Percent, + + // Lifetime or character literal. + '\'' => lifetime_or_char(cursor), + + // String literal. + '"' => { + let terminated = double_quoted_string(cursor); + let suffix_start = cursor.len_consumed(); if terminated { - self.eat_literal_suffix(); - } - let kind = Char { terminated }; - return Literal { kind, suffix_start }; - } - - // Either a lifetime or a character literal with - // length greater than 1. - - let starts_with_number = self.first().is_digit(10); - - // Skip the literal contents. - // First symbol can be a number (which isn't a valid identifier start), - // so skip it without any checks. - self.bump(); - self.eat_while(is_id_continue); - - // Check if after skipping literal contents we've met a closing - // single quote (which means that user attempted to create a - // string with single quotes). - if self.first() == '\'' { - self.bump(); - let kind = Char { terminated: true }; - Literal { kind, suffix_start: self.len_consumed() } - } else { - Lifetime { starts_with_number } - } - } - - fn single_quoted_string(&mut self) -> bool { - debug_assert!(self.prev() == '\''); - // Check if it's a one-symbol literal. - if self.second() == '\'' && self.first() != '\\' { - self.bump(); - self.bump(); - return true; - } - - // Literal has more than one symbol. - - // Parse until either quotes are terminated or error is detected. - loop { - match self.first() { - // Quotes are terminated, finish parsing. - '\'' => { - self.bump(); - return true; - } - // Probably beginning of the comment, which we don't want to include - // to the error report. 
- '/' => break, - // Newline without following '\'' means unclosed quote, stop parsing. - '\n' if self.second() != '\'' => break, - // End of file, stop parsing. - EOF_CHAR if self.is_eof() => break, - // Escaped slash is considered one character, so bump twice. - '\\' => { - self.bump(); - self.bump(); - } - // Skip the character. - _ => { - self.bump(); - } - } - } - // String was not terminated. - false - } - - /// Eats double-quoted string and returns true - /// if string is terminated. - fn double_quoted_string(&mut self) -> bool { - debug_assert!(self.prev() == '"'); - while let Some(c) = self.bump() { - match c { - '"' => { - return true; - } - '\\' if self.first() == '\\' || self.first() == '"' => { - // Bump again to skip escaped character. - self.bump(); - } - _ => (), - } - } - // End of file reached. - false - } - - /// Eats the double-quoted string and returns `n_hashes` and an error if encountered. - fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u16, Option) { - // Wrap the actual function to handle the error with too many hashes. - // This way, it eats the whole raw string. - let (n_hashes, err) = self.raw_string_unvalidated(prefix_len); - // Only up to 65535 `#`s are allowed in raw strings - match u16::try_from(n_hashes) { - Ok(num) => (num, err), - // We lie about the number of hashes here :P - Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })), - } - } - - fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option) { - debug_assert!(self.prev() == 'r'); - let start_pos = self.len_consumed(); - let mut possible_terminator_offset = None; - let mut max_hashes = 0; - - // Count opening '#' symbols. - let mut eaten = 0; - while self.first() == '#' { - eaten += 1; - self.bump(); - } - let n_start_hashes = eaten; - - // Check that string is started. 
- match self.bump() { - Some('"') => (), - c => { - let c = c.unwrap_or(EOF_CHAR); - return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c })); + eat_literal_suffix(cursor); } + let kind = LiteralKind::Str { terminated }; + Literal { kind, suffix_start } } + _ => Unknown, + }; + Token::new(token_kind, cursor.len_consumed()) +} - // Skip the string contents and on each '#' character met, check if this is - // a raw string termination. - loop { - self.eat_while(|c| c != '"'); - - if self.is_eof() { - return ( - n_start_hashes, - Some(RawStrError::NoTerminator { - expected: n_start_hashes, - found: max_hashes, - possible_terminator_offset, - }), - ); - } +fn line_comment(cursor: &mut Cursor<'_>) -> TokenKind { + debug_assert!(cursor.prev() == '/' && cursor.peek() == '/'); + cursor.bump(); - // Eat closing double quote. - self.bump(); - - // Check that amount of closing '#' symbols - // is equal to the amount of opening ones. - // Note that this will not consume extra trailing `#` characters: - // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }` - // followed by a `#` token. - let mut n_end_hashes = 0; - while self.first() == '#' && n_end_hashes < n_start_hashes { - n_end_hashes += 1; - self.bump(); - } + let doc_style = match cursor.peek() { + // `//!` is an inner line doc comment. + '!' => Some(DocStyle::Inner), + // `////` (more than 3 slashes) is not considered a doc comment. 
+ '/' if cursor.peek_second() != '/' => Some(DocStyle::Outer), + _ => None, + }; - if n_end_hashes == n_start_hashes { - return (n_start_hashes, None); - } else if n_end_hashes > max_hashes { - // Keep track of possible terminators to give a hint about - // where there might be a missing terminator - possible_terminator_offset = - Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len); - max_hashes = n_end_hashes; - } - } - } + cursor.bump_while(|c| c != '\n'); + LineComment { doc_style } +} - fn eat_decimal_digits(&mut self) -> bool { - let mut has_digits = false; - loop { - match self.first() { - '_' => { - self.bump(); - } - '0'..='9' => { - has_digits = true; - self.bump(); - } - _ => break, +fn block_comment(cursor: &mut Cursor<'_>) -> TokenKind { + debug_assert!(cursor.prev() == '/' && cursor.peek() == '*'); + cursor.bump(); + + let doc_style = match cursor.peek() { + // `/*!` is an inner block doc comment. + '!' => Some(DocStyle::Inner), + // `/***` (more than 2 stars) is not considered a doc comment. + // `/**/` is not considered a doc comment. + '*' if !matches!(cursor.peek_second(), '*' | '/') => Some(DocStyle::Outer), + _ => None, + }; + + let mut depth = 1usize; + while let Some(c) = cursor.bump() { + match c { + '/' if cursor.peek() == '*' => { + cursor.bump(); + depth += 1; } - } - has_digits - } - - fn eat_hexadecimal_digits(&mut self) -> bool { - let mut has_digits = false; - loop { - match self.first() { - '_' => { - self.bump(); - } - '0'..='9' | 'a'..='f' | 'A'..='F' => { - has_digits = true; - self.bump(); + '*' if cursor.peek() == '/' => { + cursor.bump(); + depth -= 1; + if depth == 0 { + // This block comment is closed, so for a construction like "/* */ */" + // there will be a successfully parsed block comment "/* */" + // and " */" will be processed separately. + break; } - _ => break, } + _ => (), } - has_digits - } - - /// Eats the float exponent. Returns true if at least one digit was met, - /// and returns false otherwise. 
- fn eat_float_exponent(&mut self) -> bool { - debug_assert!(self.prev() == 'e' || self.prev() == 'E'); - if self.first() == '-' || self.first() == '+' { - self.bump(); - } - self.eat_decimal_digits() - } - - // Eats the suffix of the literal, e.g. "_u8". - fn eat_literal_suffix(&mut self) { - self.eat_identifier(); } - // Eats the identifier. - fn eat_identifier(&mut self) { - if !is_id_start(self.first()) { - return; - } - self.bump(); - - self.eat_while(is_id_continue); - } + BlockComment { doc_style, terminated: depth == 0 } +} - /// Eats symbols while predicate returns true or until the end of file is reached. - fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) { - while predicate(self.first()) && !self.is_eof() { - self.bump(); - } - } +/// Start is already eaten, eat the rest of identifier. +pub(crate) fn ident(cursor: &mut Cursor<'_>) -> TokenKind { + debug_assert!(is_id_start(cursor.prev())); + cursor.bump_while(is_id_continue); + Ident } diff --git a/compiler/rustc_lexer/src/literals.rs b/compiler/rustc_lexer/src/literals.rs new file mode 100644 index 0000000000000..33792622bec57 --- /dev/null +++ b/compiler/rustc_lexer/src/literals.rs @@ -0,0 +1,366 @@ +use crate::cursor::{Cursor, EOF_CHAR}; +use crate::{ident, is_id_continue, is_id_start, TokenKind}; +use std::convert::TryFrom; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum LiteralKind { + /// "12_u8", "0o100", "0b120i99" + Int { base: Base, empty_int: bool }, + /// "12.34f32", "0b100.100" + Float { base: Base, empty_exponent: bool }, + /// "'a'", "'\\'", "'''", "';" + Char { terminated: bool }, + /// "b'a'", "b'\\'", "b'''", "b';" + Byte { terminated: bool }, + /// ""abc"", ""abc" + Str { terminated: bool }, + /// "b"abc"", "b"abc" + ByteStr { terminated: bool }, + /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a" + RawStr { n_hashes: u16, err: Option }, + /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a" + RawByteStr { n_hashes: u16, err: 
Option }, +} + +/// Base of numeric literal encoding according to its prefix. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum Base { + /// Literal starts with "0b". + Binary, + /// Literal starts with "0o". + Octal, + /// Literal starts with "0x". + Hexadecimal, + /// Literal doesn't contain a prefix. + Decimal, +} + +/// Error produced validating a raw string. Represents cases like: +/// - `r##~"abcde"##`: `InvalidStarter` +/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11)` +/// - Too many `#`s (>65535): `TooManyDelimiters` +// perf note: It doesn't matter that this makes `Token` 36 bytes bigger. See #77629 +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum RawStrError { + /// Non `#` characters exist between `r` and `"` eg. `r#~"..` + InvalidStarter { bad_char: char }, + /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they + /// may have intended to terminate it. + NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option }, + /// More than 65535 `#`s exist. + TooManyDelimiters { found: usize }, +} + +pub(crate) fn number(cursor: &mut Cursor<'_>, first_digit: char) -> LiteralKind { + debug_assert!('0' <= cursor.prev() && cursor.prev() <= '9'); + let mut base = Base::Decimal; + if first_digit == '0' { + // Attempt to parse encoding base. + let has_digits = match cursor.peek() { + 'b' => { + base = Base::Binary; + cursor.bump(); + eat_decimal_digits(cursor) + } + 'o' => { + base = Base::Octal; + cursor.bump(); + eat_decimal_digits(cursor) + } + 'x' => { + base = Base::Hexadecimal; + cursor.bump(); + eat_hexadecimal_digits(cursor) + } + // Not a base prefix. + '0'..='9' | '_' | '.' | 'e' | 'E' => { + eat_decimal_digits(cursor); + true + } + // Just a 0. 
+ _ => return LiteralKind::Int { base, empty_int: false }, + }; + // Base prefix was provided, but there were no digits + // after it, e.g. "0x". + if !has_digits { + return LiteralKind::Int { base, empty_int: true }; + } + } else { + // No base prefix, parse number in the usual way. + eat_decimal_digits(cursor); + }; + + match cursor.peek() { + // Don't be greedy if this is actually an + // integer literal followed by field/method access or a range pattern + // (`0..2` and `12.foo()`) + '.' if cursor.peek_second() != '.' && !is_id_start(cursor.peek_second()) => { + // might have stuff after the ., and if it does, it needs to start + // with a number + cursor.bump(); + let mut empty_exponent = false; + if cursor.peek().is_digit(10) { + eat_decimal_digits(cursor); + match cursor.peek() { + 'e' | 'E' => { + cursor.bump(); + empty_exponent = !eat_float_exponent(cursor); + } + _ => (), + } + } + LiteralKind::Float { base, empty_exponent } + } + 'e' | 'E' => { + cursor.bump(); + let empty_exponent = !eat_float_exponent(cursor); + LiteralKind::Float { base, empty_exponent } + } + _ => LiteralKind::Int { base, empty_int: false }, + } +} + +pub(crate) fn eat_decimal_digits(cursor: &mut Cursor<'_>) -> bool { + let mut has_digits = false; + loop { + match cursor.peek() { + '_' => { + cursor.bump(); + } + '0'..='9' => { + has_digits = true; + cursor.bump(); + } + _ => break, + } + } + has_digits +} + +pub(crate) fn eat_hexadecimal_digits(cursor: &mut Cursor<'_>) -> bool { + let mut has_digits = false; + loop { + match cursor.peek() { + '_' => { + cursor.bump(); + } + '0'..='9' | 'a'..='f' | 'A'..='F' => { + has_digits = true; + cursor.bump(); + } + _ => break, + } + } + has_digits +} + +/// Eats the float exponent. Returns true if at least one digit was met, +/// and returns false otherwise. 
+fn eat_float_exponent(cursor: &mut Cursor<'_>) -> bool { + debug_assert!(cursor.prev() == 'e' || cursor.prev() == 'E'); + if cursor.peek() == '-' || cursor.peek() == '+' { + cursor.bump(); + } + eat_decimal_digits(cursor) +} + +pub(crate) fn lifetime_or_char(cursor: &mut Cursor<'_>) -> TokenKind { + debug_assert!(cursor.prev() == '\''); + + let can_be_a_lifetime = if cursor.peek_second() == '\'' { + // It's surely not a lifetime. + false + } else { + // If the first symbol is valid for identifier, it can be a lifetime. + // Also check if it's a number for a better error reporting (so '0 will + // be reported as invalid lifetime and not as unterminated char literal). + is_id_start(cursor.peek()) || cursor.peek().is_digit(10) + }; + + if !can_be_a_lifetime { + let terminated = single_quoted_string(cursor); + let suffix_start = cursor.len_consumed(); + if terminated { + eat_literal_suffix(cursor); + } + let kind = LiteralKind::Char { terminated }; + return TokenKind::Literal { kind, suffix_start }; + } + + // Either a lifetime or a character literal with + // length greater than 1. + + let starts_with_number = cursor.peek().is_digit(10); + + // Skip the literal contents. + // First symbol can be a number (which isn't a valid identifier start), + // so skip it without any checks. + cursor.bump(); + cursor.bump_while(is_id_continue); + + // Check if after skipping literal contents we've met a closing + // single quote (which means that user attempted to create a + // string with single quotes). + if cursor.peek() == '\'' { + cursor.bump(); + let kind = LiteralKind::Char { terminated: true }; + TokenKind::Literal { kind, suffix_start: cursor.len_consumed() } + } else { + TokenKind::Lifetime { starts_with_number } + } +} + +pub(crate) fn single_quoted_string(cursor: &mut Cursor<'_>) -> bool { + debug_assert!(cursor.prev() == '\''); + // Check if it's a one-symbol literal. 
+ if cursor.peek_second() == '\'' && cursor.peek() != '\\' { + cursor.bump(); + cursor.bump(); + return true; + } + + // Literal has more than one symbol. + + // Parse until either quotes are terminated or error is detected. + loop { + match cursor.peek() { + // Quotes are terminated, finish parsing. + '\'' => { + cursor.bump(); + return true; + } + // Probably beginning of the comment, which we don't want to include + // to the error report. + '/' => break, + // Newline without following '\'' means unclosed quote, stop parsing. + '\n' if cursor.peek_second() != '\'' => break, + // End of file, stop parsing. + EOF_CHAR if cursor.is_eof() => break, + // Escaped slash is considered one character, so bump twice. + '\\' => { + cursor.bump(); + cursor.bump(); + } + // Skip the character. + _ => { + cursor.bump(); + } + } + } + // String was not terminated. + false +} + +/// Eats double-quoted string and returns true +/// if string is terminated. +pub(crate) fn double_quoted_string(cursor: &mut Cursor<'_>) -> bool { + debug_assert!(cursor.prev() == '"'); + while let Some(c) = cursor.bump() { + match c { + '"' => { + return true; + } + '\\' if cursor.peek() == '\\' || cursor.peek() == '"' => { + // Bump again to skip escaped character. + cursor.bump(); + } + _ => (), + } + } + // End of file reached. + false +} + +/// Eats the double-quoted string and returns `n_hashes` and an error if encountered. +pub(crate) fn raw_double_quoted_string( + cursor: &mut Cursor<'_>, + prefix_len: usize, +) -> (u16, Option<RawStrError>) { + // Wrap the actual function to handle the error with too many hashes. + // This way, it eats the whole raw string. + let (n_hashes, err) = raw_string_unvalidated(cursor, prefix_len); + + // Only up to 65535 `#`s are allowed in raw strings.
+ match u16::try_from(n_hashes) { + Ok(num) => (num, err), + // We lie about the number of hashes here :P + Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })), + } +} + +fn raw_string_unvalidated( + cursor: &mut Cursor<'_>, + prefix_len: usize, +) -> (usize, Option<RawStrError>) { + debug_assert!(cursor.prev() == 'r'); + let start_pos = cursor.len_consumed(); + let mut possible_terminator_offset = None; + let mut max_hashes = 0; + + // Count opening '#' symbols. + let mut eaten = 0; + while cursor.peek() == '#' { + eaten += 1; + cursor.bump(); + } + let n_start_hashes = eaten; + + // Check that string is started. + match cursor.bump() { + Some('"') => (), + c => { + let c = c.unwrap_or(EOF_CHAR); + return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c })); + } + } + + // Skip the string contents and on each '#' character met, check if this is + // a raw string termination. + loop { + cursor.bump_while(|c| c != '"'); + + if cursor.is_eof() { + return ( + n_start_hashes, + Some(RawStrError::NoTerminator { + expected: n_start_hashes, + found: max_hashes, + possible_terminator_offset, + }), + ); + } + + // Eat closing double quote. + cursor.bump(); + + // Check that amount of closing '#' symbols + // is equal to the amount of opening ones. + // Note that this will not consume extra trailing `#` characters: + // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }` + // followed by a `#` token. + let mut n_end_hashes = 0; + while cursor.peek() == '#' && n_end_hashes < n_start_hashes { + n_end_hashes += 1; + cursor.bump(); + } + + if n_end_hashes == n_start_hashes { + return (n_start_hashes, None); + } else if n_end_hashes > max_hashes { + // Keep track of possible terminators to give a hint about + // where there might be a missing terminator + possible_terminator_offset = + Some(cursor.len_consumed() - start_pos - n_end_hashes + prefix_len); + max_hashes = n_end_hashes; + } + } +} + +/// Eats the suffix of a literal, e.g. "_u8".
+pub(crate) fn eat_literal_suffix(cursor: &mut Cursor<'_>) { + // Eats one identifier. + if is_id_start(cursor.peek()) { + cursor.bump(); + ident(cursor); + } +} diff --git a/compiler/rustc_lexer/src/tests.rs b/compiler/rustc_lexer/src/tests.rs index 94017b7b286e2..8f1a968526609 100644 --- a/compiler/rustc_lexer/src/tests.rs +++ b/compiler/rustc_lexer/src/tests.rs @@ -1,12 +1,13 @@ -use super::*; - +use crate::cursor::Cursor; +use crate::literals::{raw_double_quoted_string, RawStrError}; +use crate::{strip_shebang, tokenize}; use expect_test::{expect, Expect}; fn check_raw_str(s: &str, expected_hashes: u16, expected_err: Option<RawStrError>) { let s = &format!("r{}", s); - let mut cursor = Cursor::new(s); + let cursor = &mut Cursor::new(s); cursor.bump(); - let (n_hashes, err) = cursor.raw_double_quoted_string(0); + let (n_hashes, err) = raw_double_quoted_string(cursor, 0); assert_eq!(n_hashes, expected_hashes); assert_eq!(err, expected_err); }