From 5b17a5e30a049b11f4bef644d2f65b748ce93c39 Mon Sep 17 00:00:00 2001 From: Igor Matuszewski Date: Mon, 13 May 2019 11:41:24 +0200 Subject: [PATCH 01/11] Clean up minor bits --- src/libsyntax/parse/lexer/mod.rs | 2 +- src/libsyntax/parse/unescape.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index e3d959c2c54c4..406e90243b33a 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -130,7 +130,7 @@ impl<'a> StringReader<'a> { self.ch.is_none() } - fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) { + fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) -> ! { let mut err = self.struct_span_fatal(pos, pos, "unterminated raw string"); err.span_label(self.mk_sp(pos, pos), "unterminated raw string"); diff --git a/src/libsyntax/parse/unescape.rs b/src/libsyntax/parse/unescape.rs index 90ee549db017b..55a628d411e4b 100644 --- a/src/libsyntax/parse/unescape.rs +++ b/src/libsyntax/parse/unescape.rs @@ -1,4 +1,4 @@ -//! Utilities for validating string and char literals and turning them into +//! Utilities for validating string and char literals and turning them into //! values they represent. use std::str::Chars; From b8e3533b732326da3f50ca35fb033cba66150405 Mon Sep 17 00:00:00 2001 From: Igor Matuszewski Date: Mon, 13 May 2019 11:42:12 +0200 Subject: [PATCH 02/11] Separate a `scan_raw_string` (similar `raw_byte` variant) --- src/libsyntax/parse/lexer/mod.rs | 159 ++++++++++++++++--------------- 1 file changed, 82 insertions(+), 77 deletions(-) diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 406e90243b33a..c42f694f50a00 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -1086,82 +1086,10 @@ impl<'a> StringReader<'a> { Ok(TokenKind::lit(token::Str, symbol, suffix)) } 'r' => { - let start_bpos = self.pos; - self.bump(); - let mut hash_count: u16 = 0; - while self.ch_is('#') { - if hash_count == 65535 { - let bpos = self.next_pos; - self.fatal_span_(start_bpos, - bpos, - "too many `#` symbols: raw strings may be \ - delimited by up to 65535 `#` symbols").raise(); - } - self.bump(); - hash_count += 1; - } - - if self.is_eof() { - self.fail_unterminated_raw_string(start_bpos, hash_count); - } else if !self.ch_is('"') { - let last_bpos = self.pos; - let curr_char = self.ch.unwrap(); - self.fatal_span_char(start_bpos, - last_bpos, - "found invalid character; only `#` is allowed \ - in raw string delimitation", - curr_char).raise(); - } - self.bump(); - let content_start_bpos = self.pos; - let mut content_end_bpos; - let mut valid = true; - 'outer: loop { - if self.is_eof() { - self.fail_unterminated_raw_string(start_bpos, hash_count); - } - // if self.ch_is('"') { - // content_end_bpos = self.pos; - // for _ in 0..hash_count { - // self.bump(); - // if !self.ch_is('#') { - // continue 'outer; - let c = self.ch.unwrap(); - match c { - '"' => { - content_end_bpos = self.pos; - for _ in 0..hash_count { - self.bump(); - if !self.ch_is('#') { - continue 'outer; - } - } - break; - } - '\r' => { - if !self.nextch_is('\n') { - let last_bpos = self.pos; - self.err_span_(start_bpos, - last_bpos, - "bare CR not allowed in raw string, use \\r \ - instead"); - valid = false; - } - } - _ => (), - } - self.bump(); - } - - self.bump(); - let symbol = if valid { - self.name_from_to(content_start_bpos, content_end_bpos) - } else { - Symbol::intern("??") - }; + let (kind, symbol) = self.scan_raw_string(); let suffix = self.scan_optional_raw_name(); - Ok(TokenKind::lit(token::StrRaw(hash_count), symbol, suffix)) + Ok(TokenKind::lit(kind, symbol, suffix)) } '-' => { if self.nextch_is('>') { @@ -1315,6 +1243,83 @@ impl<'a> StringReader<'a> { id } + fn scan_raw_string(&mut self) -> (token::LitKind, Symbol) { + let start_bpos = self.pos; + self.bump(); + let mut hash_count: u16 = 0; + while self.ch_is('#') { + if hash_count == 65535 { + let bpos = self.next_pos; + self.fatal_span_(start_bpos, + bpos, + "too many `#` symbols: raw strings may be \ + delimited by up to 65535 `#` symbols").raise(); + } + self.bump(); + hash_count += 1; + } + + if self.is_eof() { + self.fail_unterminated_raw_string(start_bpos, hash_count); + } else if !self.ch_is('"') { + let last_bpos = self.pos; + let curr_char = self.ch.unwrap(); + self.fatal_span_char(start_bpos, + last_bpos, + "found invalid character; only `#` is allowed \ + in raw string delimitation", + curr_char).raise(); + } + self.bump(); + let content_start_bpos = self.pos; + let mut content_end_bpos; + let mut valid = true; + 'outer: loop { + // if self.ch_is('"') { + // content_end_bpos = self.pos; + // for _ in 0..hash_count { + // self.bump(); + // if !self.ch_is('#') { + // continue 'outer; + match self.ch { + None => { + self.fail_unterminated_raw_string(start_bpos, hash_count); + } + Some('"') => { + content_end_bpos = self.pos; + for _ in 0..hash_count { + self.bump(); + if !self.ch_is('#') { + continue 'outer; + } + } + break; + } + Some(c) => { + if c == '\r' && !self.nextch_is('\n') { + let last_bpos = self.pos; + self.err_span_(start_bpos, + last_bpos, + "bare CR not allowed in raw string, use \\r \ + instead"); + valid = false; + } + } + } + self.bump(); + } + + self.bump(); + + let symbol = if valid { + self.name_from_to(content_start_bpos, content_end_bpos) + } else { + Symbol::intern("??") + }; + + (token::StrRaw(hash_count), symbol) + } + fn scan_raw_byte_string(&mut self) -> (token::LitKind, Symbol) { let start_bpos = self.pos; self.bump(); @@ -1324,7 +1329,7 @@ impl<'a> StringReader<'a> { let bpos = self.next_pos; self.fatal_span_(start_bpos, bpos, - "too many `#` symbols: raw byte strings may be \ + "too many `#` symbols: raw strings may be \ delimited by up to 65535 `#` symbols").raise(); } self.bump(); @@ -1334,8 +1339,8 @@ impl<'a> StringReader<'a> { if self.is_eof() { self.fail_unterminated_raw_string(start_bpos, hash_count); } else if !self.ch_is('"') { - let pos = self.pos; - let ch = self.ch.unwrap(); + let last_bpos = self.pos; + let curr_char = self.ch.unwrap(); self.fatal_span_char(start_bpos, pos, "found invalid character; only `#` is allowed in raw \ From 08ede49dcb0ae9a085f1cb8ccf6bc0ba682c83e7 Mon Sep 17 00:00:00 2001 From: Igor Matuszewski Date: Mon, 13 May 2019 12:07:43 +0200 Subject: [PATCH 03/11] Remove redundant, commented out code It was commented out as part of https://github.com/rust-lang/rust/commit/8a8e497ae786ffc032c1e68fc23da0edcf6fa5e3. Done probably by accident, since the code in question was moved to a match arm, along with newly introduced logic to detect bare CRs in raw strings. --- src/libsyntax/parse/lexer/mod.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index c42f694f50a00..02ef94fe9adba 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -1275,12 +1275,6 @@ impl<'a> StringReader<'a> { let mut content_end_bpos; let mut valid = true; 'outer: loop { - // if self.ch_is('"') { - // content_end_bpos = self.pos; - // for _ in 0..hash_count { - // self.bump(); - // if !self.ch_is('#') { - // continue 'outer; match self.ch { None => { self.fail_unterminated_raw_string(start_bpos, hash_count); From cab7e7fe76c3c881078f068a8da4a863efdd2c77 Mon Sep 17 00:00:00 2001 From: Igor Matuszewski Date: Mon, 13 May 2019 19:52:55 +0200 Subject: [PATCH 04/11] Validate and transcribe raw strings via unescape module --- src/libsyntax/parse/lexer/mod.rs | 45 ++++++++++--------- src/libsyntax/parse/literal.rs | 38 ++++++---------- src/libsyntax/parse/unescape.rs | 22 +++++++++ .../lex-bare-cr-string-literal-doc-comment.rs | 2 +- ...-bare-cr-string-literal-doc-comment.stderr | 6 +-- 5 files changed, 63 insertions(+), 50 deletions(-) diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 02ef94fe9adba..e3830b1e3b668 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -1086,10 +1086,12 @@ impl<'a> StringReader<'a> { Ok(TokenKind::lit(token::Str, symbol, suffix)) } 'r' => { - let (kind, symbol) = self.scan_raw_string(); + let (start, end, hash_count) = self.scan_raw_string(); + let symbol = self.name_from_to(start, end); + self.validate_raw_str_escape(start, end); let suffix = self.scan_optional_raw_name(); - Ok(TokenKind::lit(kind, symbol, suffix)) + Ok(TokenKind::lit(token::StrRaw(hash_count), symbol, suffix)) } '-' => { if self.nextch_is('>') { @@ -1243,7 +1245,7 @@ impl<'a> StringReader<'a> { id } - fn scan_raw_string(&mut self) -> (token::LitKind, Symbol) { + fn scan_raw_string(&mut self) -> (BytePos, BytePos, u16) { let start_bpos = self.pos; self.bump(); let mut hash_count: u16 = 0; @@ -1273,7 +1275,6 @@ impl<'a> StringReader<'a> { self.bump(); let content_start_bpos = self.pos; let mut content_end_bpos; - let mut valid = true; 'outer: loop { match self.ch { None => { @@ -1289,29 +1290,14 @@ impl<'a> StringReader<'a> { } break; } - Some(c) => { - if c == '\r' && !self.nextch_is('\n') { - let last_bpos = self.pos; - self.err_span_(start_bpos, - last_bpos, - "bare CR not allowed in raw string, use \\r \ - instead"); - valid = false; - } - } + _ => (), } self.bump(); } self.bump(); - let symbol = if valid { - self.name_from_to(content_start_bpos, content_end_bpos) - } else { - Symbol::intern("??") - }; - - (token::StrRaw(hash_count), symbol) + (content_start_bpos, content_end_bpos, hash_count) } fn scan_raw_byte_string(&mut self) -> (token::LitKind, Symbol) { @@ -1421,6 +1407,23 @@ impl<'a> StringReader<'a> { }); } + fn validate_raw_str_escape(&self, content_start: BytePos, content_end: BytePos) { + self.with_str_from_to(content_start, content_end, |lit: &str| { + unescape::unescape_raw_str(lit, &mut |range, c| { + if let Err(err) = c { + emit_unescape_error( + &self.sess.span_diagnostic, + lit, + self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), + unescape::Mode::Str, + range, + err, + ) + } + }) + }); + } + fn validate_byte_str_escape(&self, start_with_quote: BytePos) { self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| { unescape::unescape_byte_str(lit, &mut |range, c| { diff --git a/src/libsyntax/parse/literal.rs b/src/libsyntax/parse/literal.rs index 7d5356ffe4d8d..3a2d905585c0e 100644 --- a/src/libsyntax/parse/literal.rs +++ b/src/libsyntax/parse/literal.rs @@ -4,7 +4,8 @@ use crate::ast::{self, Lit, LitKind}; use crate::parse::parser::Parser; use crate::parse::PResult; use crate::parse::token::{self, Token, TokenKind}; -use crate::parse::unescape::{unescape_str, unescape_char, unescape_byte_str, unescape_byte}; +use crate::parse::unescape::{unescape_str, unescape_byte_str, unescape_raw_str}; +use crate::parse::unescape::{unescape_char, unescape_byte}; use crate::print::pprust; use crate::symbol::{kw, sym, Symbol}; use crate::tokenstream::{TokenStream, TokenTree}; @@ -141,7 +142,17 @@ impl LitKind { // Ditto. let s = symbol.as_str(); let symbol = if s.contains('\r') { - Symbol::intern(&raw_str_lit(&s)) + let mut buf = String::with_capacity(s.len()); + let mut error = Ok(()); + unescape_raw_str(&s, &mut |_, unescaped_char| { + match unescaped_char { + Ok(c) => buf.push(c), + Err(_) => error = Err(LitError::LexerError), + } + }); + error?; + buf.shrink_to_fit(); + Symbol::intern(&buf) } else { symbol }; @@ -350,29 +361,6 @@ crate fn expect_no_suffix(diag: &Handler, sp: Span, kind: &str, suffix: Option String { - debug!("raw_str_lit: {:?}", lit); - let mut res = String::with_capacity(lit.len()); - - let mut chars = lit.chars().peekable(); - while let Some(c) = chars.next() { - if c == '\r' { - if *chars.peek().unwrap() != '\n' { - panic!("lexer accepted bare CR"); - } - chars.next(); - res.push('\n'); - } else { - res.push(c); - } - } - - res.shrink_to_fit(); - res -} - // Checks if `s` looks like i32 or u1234 etc. fn looks_like_width_suffix(first_chars: &[char], s: &str) -> bool { s.len() > 1 && s.starts_with(first_chars) && s[1..].chars().all(|c| c.is_ascii_digit()) diff --git a/src/libsyntax/parse/unescape.rs b/src/libsyntax/parse/unescape.rs index 55a628d411e4b..d6b7db16305bb 100644 --- a/src/libsyntax/parse/unescape.rs +++ b/src/libsyntax/parse/unescape.rs @@ -66,6 +66,28 @@ where }) } +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of characters or errors. +/// NOTE: Raw strings do not perform any explicit character escaping, here we +/// only translate CRLF to LF and produce errors on bare CR. +pub(crate) fn unescape_raw_str(literal_text: &str, callback: &mut F) +where + F: FnMut(Range, Result), +{ + let mut byte_offset: usize = 0; + + let mut chars = literal_text.chars().peekable(); + while let Some(curr) = chars.next() { + let result = match (curr, chars.peek()) { + ('\r', Some('\n')) => Ok(curr), + ('\r', _) => Err(EscapeError::BareCarriageReturn), + _ => Ok(curr), + }; + callback(byte_offset..(byte_offset + curr.len_utf8()), result); + byte_offset += curr.len_utf8(); + } +} + #[derive(Debug, Clone, Copy)] pub(crate) enum Mode { Char, diff --git a/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.rs b/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.rs index b588b007ae929..ed5df42f9dd4e 100644 --- a/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.rs +++ b/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.rs @@ -21,7 +21,7 @@ fn main() { let _s = "foo bar"; //~ ERROR: bare CR not allowed in string // the following string literal has a bare CR in it - let _s = r"bar foo"; //~ ERROR: bare CR not allowed in raw string + let _s = r"bar foo"; //~ ERROR: bare CR not allowed in string // the following string literal has a bare CR in it let _s = "foo\ bar"; //~ ERROR: unknown character escape: \r diff --git a/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.stderr b/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.stderr index 7d944569ca9c4..153237a7f71b4 100644 --- a/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.stderr +++ b/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.stderr @@ -28,11 +28,11 @@ error: bare CR not allowed in string, use \r instead LL | let _s = "foo bar"; | ^ -error: bare CR not allowed in raw string, use \r instead - --> $DIR/lex-bare-cr-string-literal-doc-comment.rs:24:14 +error: bare CR not allowed in string, use \r instead + --> $DIR/lex-bare-cr-string-literal-doc-comment.rs:24:19 | LL | let _s = r"bar foo"; - | ^^^^^ + | ^ error: unknown character escape: \r --> $DIR/lex-bare-cr-string-literal-doc-comment.rs:27:19 From 49d62e8d5a9df16e8ed6c703031fb72d264e3469 Mon Sep 17 00:00:00 2001 From: Igor Matuszewski Date: Mon, 13 May 2019 20:21:44 +0200 Subject: [PATCH 05/11] Prohibit bare CRs in raw byte strings --- src/libsyntax/parse/lexer/mod.rs | 94 +++++-------------- src/libsyntax/parse/unescape.rs | 24 +++++ .../parse/unescape_error_reporting.rs | 5 + .../ui/parser/raw-byte-string-literals.rs | 3 + .../ui/parser/raw-byte-string-literals.stderr | 14 ++- 5 files changed, 66 insertions(+), 74 deletions(-) diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index e3830b1e3b668..685c17d104bbb 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -292,15 +292,6 @@ impl<'a> StringReader<'a> { self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..]) } - /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an - /// escaped character to the error message - fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) { - let mut m = m.to_string(); - m.push_str(": "); - push_escaped_char(&mut m, c); - self.err_span_(from_pos, to_pos, &m[..]); - } - /// Advance peek_token to refer to the next token, and /// possibly update the interner. fn advance_token(&mut self) -> Result<(), ()> { @@ -1070,7 +1061,13 @@ impl<'a> StringReader<'a> { self.validate_byte_str_escape(start_with_quote); (token::ByteStr, symbol) }, - Some('r') => self.scan_raw_byte_string(), + Some('r') => { + let (start, end, hash_count) = self.scan_raw_string(); + let symbol = self.name_from_to(start, end); + self.validate_raw_byte_str_escape(start, end); + + (token::ByteStrRaw(hash_count), symbol) + } _ => unreachable!(), // Should have been a token::Ident above. }; let suffix = self.scan_optional_raw_name(); @@ -1300,66 +1297,6 @@ impl<'a> StringReader<'a> { (content_start_bpos, content_end_bpos, hash_count) } - fn scan_raw_byte_string(&mut self) -> (token::LitKind, Symbol) { - let start_bpos = self.pos; - self.bump(); - let mut hash_count = 0; - while self.ch_is('#') { - if hash_count == 65535 { - let bpos = self.next_pos; - self.fatal_span_(start_bpos, - bpos, - "too many `#` symbols: raw strings may be \ - delimited by up to 65535 `#` symbols").raise(); - } - self.bump(); - hash_count += 1; - } - - if self.is_eof() { - self.fail_unterminated_raw_string(start_bpos, hash_count); - } else if !self.ch_is('"') { - let last_bpos = self.pos; - let curr_char = self.ch.unwrap(); - self.fatal_span_char(start_bpos, - pos, - "found invalid character; only `#` is allowed in raw \ - string delimitation", - ch).raise(); - } - self.bump(); - let content_start_bpos = self.pos; - let mut content_end_bpos; - 'outer: loop { - match self.ch { - None => { - self.fail_unterminated_raw_string(start_bpos, hash_count); - } - Some('"') => { - content_end_bpos = self.pos; - for _ in 0..hash_count { - self.bump(); - if !self.ch_is('#') { - continue 'outer; - } - } - break; - } - Some(c) => { - if c > '\x7F' { - let pos = self.pos; - self.err_span_char(pos, pos, "raw byte string must be ASCII", c); - } - } - } - self.bump(); - } - - self.bump(); - - (token::ByteStrRaw(hash_count), self.name_from_to(content_start_bpos, content_end_bpos)) - } - fn validate_char_escape(&self, start_with_quote: BytePos) { self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| { if let Err((off, err)) = unescape::unescape_char(lit) { @@ -1424,6 +1361,23 @@ impl<'a> StringReader<'a> { }); } + fn validate_raw_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) { + self.with_str_from_to(content_start, content_end, |lit: &str| { + unescape::unescape_raw_byte_str(lit, &mut |range, c| { + if let Err(err) = c { + emit_unescape_error( + &self.sess.span_diagnostic, + lit, + self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), + unescape::Mode::ByteStr, + range, + err, + ) + } + }) + }); + } + fn validate_byte_str_escape(&self, start_with_quote: BytePos) { self.with_str_from_to(start_with_quote + BytePos(1), self.pos - BytePos(1), |lit| { unescape::unescape_byte_str(lit, &mut |range, c| { diff --git a/src/libsyntax/parse/unescape.rs b/src/libsyntax/parse/unescape.rs index d6b7db16305bb..819463b547250 100644 --- a/src/libsyntax/parse/unescape.rs +++ b/src/libsyntax/parse/unescape.rs @@ -29,6 +29,7 @@ pub(crate) enum EscapeError { UnicodeEscapeInByte, NonAsciiCharInByte, + NonAsciiCharInByteString, } /// Takes a contents of a char literal (without quotes), and returns an @@ -88,6 +89,29 @@ where } } +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of characters or errors. +/// NOTE: Raw strings do not perform any explicit character escaping, here we +/// only translate CRLF to LF and produce errors on bare CR. +pub(crate) fn unescape_raw_byte_str(literal_text: &str, callback: &mut F) +where + F: FnMut(Range, Result), +{ + let mut byte_offset: usize = 0; + + let mut chars = literal_text.chars().peekable(); + while let Some(curr) = chars.next() { + let result = match (curr, chars.peek()) { + ('\r', Some('\n')) => Ok(curr), + ('\r', _) => Err(EscapeError::BareCarriageReturn), + (c, _) if c > '\x7F' => Err(EscapeError::NonAsciiCharInByteString), + _ => Ok(curr), + }; + callback(byte_offset..(byte_offset + curr.len_utf8()), result); + byte_offset += curr.len_utf8(); + } +} + #[derive(Debug, Clone, Copy)] pub(crate) enum Mode { Char, diff --git a/src/libsyntax/parse/unescape_error_reporting.rs b/src/libsyntax/parse/unescape_error_reporting.rs index 22777c0884f47..8f152974a6d3f 100644 --- a/src/libsyntax/parse/unescape_error_reporting.rs +++ b/src/libsyntax/parse/unescape_error_reporting.rs @@ -124,6 +124,11 @@ pub(crate) fn emit_unescape_error( handler.span_err(span, "byte constant must be ASCII. \ Use a \\xHH escape for a non-ASCII byte") } + EscapeError::NonAsciiCharInByteString => { + assert!(mode.is_bytes()); + let (_c, span) = last_char(); + handler.span_err(span, "raw byte string must be ASCII") + } EscapeError::OutOfRangeHexEscape => { handler.span_err(span, "this form of character escape may only be used \ with characters in the range [\\x00-\\x7f]") diff --git a/src/test/ui/parser/raw-byte-string-literals.rs b/src/test/ui/parser/raw-byte-string-literals.rs index 3b50fb8036ada..87ecfb5c5445c 100644 --- a/src/test/ui/parser/raw-byte-string-literals.rs +++ b/src/test/ui/parser/raw-byte-string-literals.rs @@ -1,4 +1,7 @@ +// ignore-tidy-cr +// compile-flags: -Z continue-parse-after-error pub fn main() { + br"a "; //~ ERROR bare CR not allowed in string br"é"; //~ ERROR raw byte string must be ASCII br##~"a"~##; //~ ERROR only `#` is allowed in raw string delimitation } diff --git a/src/test/ui/parser/raw-byte-string-literals.stderr b/src/test/ui/parser/raw-byte-string-literals.stderr index 671ed97d1b52a..03fe79722b844 100644 --- a/src/test/ui/parser/raw-byte-string-literals.stderr +++ b/src/test/ui/parser/raw-byte-string-literals.stderr @@ -1,14 +1,20 @@ -error: raw byte string must be ASCII: \u{e9} - --> $DIR/raw-byte-string-literals.rs:2:8 +error: bare CR not allowed in string, use \r instead + --> $DIR/raw-byte-string-literals.rs:4:9 + | +LL | br"a "; + | ^ + +error: raw byte string must be ASCII + --> $DIR/raw-byte-string-literals.rs:5:8 | LL | br"é"; | ^ error: found invalid character; only `#` is allowed in raw string delimitation: ~ - --> $DIR/raw-byte-string-literals.rs:3:6 + --> $DIR/raw-byte-string-literals.rs:6:6 | LL | br##~"a"~##; | ^^^ -error: aborting due to 2 previous errors +error: aborting due to 3 previous errors From d4632744fa0fb18a7c3f5058f1e8157c760353b4 Mon Sep 17 00:00:00 2001 From: Igor Matuszewski Date: Sun, 9 Jun 2019 00:33:21 +0200 Subject: [PATCH 06/11] Translate CRLF -> LF in raw (byte) strings --- src/libsyntax/parse/lexer/mod.rs | 4 +-- src/libsyntax/parse/literal.rs | 4 +-- src/libsyntax/parse/unescape.rs | 44 ++++++++++---------------------- 3 files changed, 18 insertions(+), 34 deletions(-) diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 685c17d104bbb..71fa4bdb2cf0e 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -1346,7 +1346,7 @@ impl<'a> StringReader<'a> { fn validate_raw_str_escape(&self, content_start: BytePos, content_end: BytePos) { self.with_str_from_to(content_start, content_end, |lit: &str| { - unescape::unescape_raw_str(lit, &mut |range, c| { + unescape::unescape_raw_str(lit, unescape::Mode::Str, &mut |range, c| { if let Err(err) = c { emit_unescape_error( &self.sess.span_diagnostic, @@ -1363,7 +1363,7 @@ impl<'a> StringReader<'a> { fn validate_raw_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) { self.with_str_from_to(content_start, content_end, |lit: &str| { - unescape::unescape_raw_byte_str(lit, &mut |range, c| { + unescape::unescape_raw_str(lit, unescape::Mode::ByteStr, &mut |range, c| { if let Err(err) = c { emit_unescape_error( &self.sess.span_diagnostic, diff --git a/src/libsyntax/parse/literal.rs b/src/libsyntax/parse/literal.rs index 3a2d905585c0e..3711512d64e56 100644 --- a/src/libsyntax/parse/literal.rs +++ b/src/libsyntax/parse/literal.rs @@ -4,7 +4,7 @@ use crate::ast::{self, Lit, LitKind}; use crate::parse::parser::Parser; use crate::parse::PResult; use crate::parse::token::{self, Token, TokenKind}; -use crate::parse::unescape::{unescape_str, unescape_byte_str, unescape_raw_str}; +use crate::parse::unescape::{self, unescape_str, unescape_byte_str, unescape_raw_str}; use crate::parse::unescape::{unescape_char, unescape_byte}; use crate::print::pprust; use crate::symbol::{kw, sym, Symbol}; @@ -144,7 +144,7 @@ impl LitKind { let symbol = if s.contains('\r') { let mut buf = String::with_capacity(s.len()); let mut error = Ok(()); - unescape_raw_str(&s, &mut |_, unescaped_char| { + unescape_raw_str(&s, unescape::Mode::Str, &mut |_, unescaped_char| { match unescaped_char { Ok(c) => buf.push(c), Err(_) => error = Err(LitError::LexerError), diff --git a/src/libsyntax/parse/unescape.rs b/src/libsyntax/parse/unescape.rs index 819463b547250..f5b6c38083e10 100644 --- a/src/libsyntax/parse/unescape.rs +++ b/src/libsyntax/parse/unescape.rs @@ -71,7 +71,7 @@ where /// sequence of characters or errors. /// NOTE: Raw strings do not perform any explicit character escaping, here we /// only translate CRLF to LF and produce errors on bare CR. -pub(crate) fn unescape_raw_str(literal_text: &str, callback: &mut F) +pub(crate) fn unescape_raw_str(literal_text: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { @@ -79,36 +79,20 @@ where let mut chars = literal_text.chars().peekable(); while let Some(curr) = chars.next() { - let result = match (curr, chars.peek()) { - ('\r', Some('\n')) => Ok(curr), - ('\r', _) => Err(EscapeError::BareCarriageReturn), - _ => Ok(curr), + let (result, scanned) = match (curr, chars.peek()) { + ('\r', Some('\n')) => { + chars.next(); + (Ok('\n'), [Some('\r'), Some('\n')]) + }, + ('\r', _) => + (Err(EscapeError::BareCarriageReturn), [Some('\r'), None]), + (c, _) if mode.is_bytes() && c > '\x7F' => + (Err(EscapeError::NonAsciiCharInByteString), [Some(c), None]), + (c, _) => (Ok(c), [Some(c), None]), }; - callback(byte_offset..(byte_offset + curr.len_utf8()), result); - byte_offset += curr.len_utf8(); - } -} - -/// Takes a contents of a string literal (without quotes) and produces a -/// sequence of characters or errors. -/// NOTE: Raw strings do not perform any explicit character escaping, here we -/// only translate CRLF to LF and produce errors on bare CR. -pub(crate) fn unescape_raw_byte_str(literal_text: &str, callback: &mut F) -where - F: FnMut(Range, Result), -{ - let mut byte_offset: usize = 0; - - let mut chars = literal_text.chars().peekable(); - while let Some(curr) = chars.next() { - let result = match (curr, chars.peek()) { - ('\r', Some('\n')) => Ok(curr), - ('\r', _) => Err(EscapeError::BareCarriageReturn), - (c, _) if c > '\x7F' => Err(EscapeError::NonAsciiCharInByteString), - _ => Ok(curr), - }; - callback(byte_offset..(byte_offset + curr.len_utf8()), result); - byte_offset += curr.len_utf8(); + let len_utf8: usize = scanned.iter().filter_map(|&x| x).map(char::len_utf8).sum(); + callback(byte_offset..(byte_offset + len_utf8), result); + byte_offset += len_utf8; } } From 8cd51fff827ab4892382170f7ef6c3379c614260 Mon Sep 17 00:00:00 2001 From: Igor Matuszewski Date: Sun, 9 Jun 2019 14:15:31 +0200 Subject: [PATCH 07/11] Add test for raw byte CRLF translation --- .../lexer-crlf-line-endings-string-literal-doc-comment.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/test/run-pass/lexer-crlf-line-endings-string-literal-doc-comment.rs b/src/test/run-pass/lexer-crlf-line-endings-string-literal-doc-comment.rs index 126cab67c1c04..f9d1b17b8dd82 100644 --- a/src/test/run-pass/lexer-crlf-line-endings-string-literal-doc-comment.rs +++ b/src/test/run-pass/lexer-crlf-line-endings-string-literal-doc-comment.rs @@ -30,6 +30,9 @@ literal"; let s = r"string literal"; assert_eq!(s, "string\nliteral"); + let s = br"byte string +literal"; + assert_eq!(s, "byte string\nliteral".as_bytes()); // validate that our source file has CRLF endings let source = include_str!("lexer-crlf-line-endings-string-literal-doc-comment.rs"); From 3c1d352dc4a66cae9127cf677ec5e609aee1b7ae Mon Sep 17 00:00:00 2001 From: Igor Matuszewski Date: Sun, 9 Jun 2019 14:20:29 +0200 Subject: [PATCH 08/11] Add a doc comment for scan_raw_string --- src/libsyntax/parse/lexer/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 71fa4bdb2cf0e..7f190bd741047 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -1242,6 +1242,8 @@ impl<'a> StringReader<'a> { id } + /// Scans a raw (byte) string, returning byte position range for `""` + /// (including quotes) along with `#` character count in `(b)r##...""##...`; fn scan_raw_string(&mut self) -> (BytePos, BytePos, u16) { let start_bpos = self.pos; self.bump(); From 735ac057bb94d179c81afcd0f3e63bc6e6856734 Mon Sep 17 00:00:00 2001 From: Igor Matuszewski Date: Sun, 9 Jun 2019 14:43:31 +0200 Subject: [PATCH 09/11] Actually translate CRLF in raw byte strings and unify unescape impl --- src/libsyntax/parse/lexer/mod.rs | 4 +- src/libsyntax/parse/literal.rs | 26 +++++++++++-- src/libsyntax/parse/unescape.rs | 64 ++++++++++++++++++++++---------- 3 files changed, 69 insertions(+), 25 deletions(-) diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 7f190bd741047..24dddcb6141c7 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -1348,7 +1348,7 @@ impl<'a> StringReader<'a> { fn validate_raw_str_escape(&self, content_start: BytePos, content_end: BytePos) { self.with_str_from_to(content_start, content_end, |lit: &str| { - unescape::unescape_raw_str(lit, unescape::Mode::Str, &mut |range, c| { + unescape::unescape_raw_str(lit, &mut |range, c| { if let Err(err) = c { emit_unescape_error( &self.sess.span_diagnostic, @@ -1365,7 +1365,7 @@ impl<'a> StringReader<'a> { fn validate_raw_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) { self.with_str_from_to(content_start, content_end, |lit: &str| { - unescape::unescape_raw_str(lit, unescape::Mode::ByteStr, &mut |range, c| { + unescape::unescape_raw_byte_str(lit, &mut |range, c| { if let Err(err) = c { emit_unescape_error( &self.sess.span_diagnostic, diff --git a/src/libsyntax/parse/literal.rs b/src/libsyntax/parse/literal.rs index 3711512d64e56..3baa5858c9db8 100644 --- a/src/libsyntax/parse/literal.rs +++ b/src/libsyntax/parse/literal.rs @@ -4,8 +4,9 @@ use crate::ast::{self, Lit, LitKind}; use crate::parse::parser::Parser; use crate::parse::PResult; use crate::parse::token::{self, Token, TokenKind}; -use crate::parse::unescape::{self, unescape_str, unescape_byte_str, unescape_raw_str}; use crate::parse::unescape::{unescape_char, unescape_byte}; +use crate::parse::unescape::{unescape_str, unescape_byte_str}; +use crate::parse::unescape::{unescape_raw_str, unescape_raw_byte_str}; use crate::print::pprust; use crate::symbol::{kw, sym, Symbol}; use crate::tokenstream::{TokenStream, TokenTree}; @@ -144,7 +145,7 @@ impl LitKind { let symbol = if s.contains('\r') { let mut buf = String::with_capacity(s.len()); let mut error = Ok(()); - unescape_raw_str(&s, unescape::Mode::Str, &mut |_, unescaped_char| { + unescape_raw_str(&s, &mut |_, unescaped_char| { match unescaped_char { Ok(c) => buf.push(c), Err(_) => error = Err(LitError::LexerError), @@ -172,7 +173,26 @@ impl LitKind { buf.shrink_to_fit(); LitKind::ByteStr(Lrc::new(buf)) } - token::ByteStrRaw(_) => LitKind::ByteStr(Lrc::new(symbol.to_string().into_bytes())), + token::ByteStrRaw(_) => { + let s = symbol.as_str(); + let bytes = if s.contains('\r') { + let mut buf = Vec::with_capacity(s.len()); + let mut error = Ok(()); + unescape_raw_byte_str(&s, &mut |_, unescaped_byte| { + match unescaped_byte { + Ok(c) => buf.push(c), + Err(_) => error = Err(LitError::LexerError), + } + }); + error?; + buf.shrink_to_fit(); + buf + } else { + symbol.to_string().into_bytes() + }; + + LitKind::ByteStr(Lrc::new(bytes)) + }, token::Err => LitKind::Err(symbol), }) } diff --git a/src/libsyntax/parse/unescape.rs b/src/libsyntax/parse/unescape.rs index f5b6c38083e10..da6de8a12daaa 100644 --- a/src/libsyntax/parse/unescape.rs +++ b/src/libsyntax/parse/unescape.rs @@ -71,29 +71,24 @@ where /// sequence of characters or errors. /// NOTE: Raw strings do not perform any explicit character escaping, here we /// only translate CRLF to LF and produce errors on bare CR. -pub(crate) fn unescape_raw_str(literal_text: &str, mode: Mode, callback: &mut F) +pub(crate) fn unescape_raw_str(literal_text: &str, callback: &mut F) where F: FnMut(Range, Result), { - let mut byte_offset: usize = 0; + unescape_raw_str_or_byte_str(literal_text, Mode::Str, callback) +} - let mut chars = literal_text.chars().peekable(); - while let Some(curr) = chars.next() { - let (result, scanned) = match (curr, chars.peek()) { - ('\r', Some('\n')) => { - chars.next(); - (Ok('\n'), [Some('\r'), Some('\n')]) - }, - ('\r', _) => - (Err(EscapeError::BareCarriageReturn), [Some('\r'), None]), - (c, _) if mode.is_bytes() && c > '\x7F' => - (Err(EscapeError::NonAsciiCharInByteString), [Some(c), None]), - (c, _) => (Ok(c), [Some(c), None]), - }; - let len_utf8: usize = scanned.iter().filter_map(|&x| x).map(char::len_utf8).sum(); - callback(byte_offset..(byte_offset + len_utf8), result); - byte_offset += len_utf8; - } +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of characters or errors. +/// NOTE: Raw strings do not perform any explicit character escaping, here we +/// only translate CRLF to LF and produce errors on bare CR. +pub(crate) fn unescape_raw_byte_str(literal_text: &str, callback: &mut F) +where + F: FnMut(Range, Result), +{ + unescape_raw_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| { + callback(range, char.map(byte_from_char)) + }) } #[derive(Debug, Clone, Copy)] @@ -284,9 +279,38 @@ where } } +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of characters or errors. +/// NOTE: Raw strings do not perform any explicit character escaping, here we +/// only translate CRLF to LF and produce errors on bare CR. +fn unescape_raw_str_or_byte_str(literal_text: &str, mode: Mode, callback: &mut F) +where + F: FnMut(Range, Result), +{ + let mut byte_offset: usize = 0; + + let mut chars = literal_text.chars().peekable(); + while let Some(curr) = chars.next() { + let (result, scanned) = match (curr, chars.peek()) { + ('\r', Some('\n')) => { + chars.next(); + (Ok('\n'), [Some('\r'), Some('\n')]) + }, + ('\r', _) => + (Err(EscapeError::BareCarriageReturn), [Some('\r'), None]), + (c, _) if mode.is_bytes() && !c.is_ascii() => + (Err(EscapeError::NonAsciiCharInByteString), [Some(c), None]), + (c, _) => (Ok(c), [Some(c), None]), + }; + let len_utf8: usize = scanned.iter().filter_map(|&x| x).map(char::len_utf8).sum(); + callback(byte_offset..(byte_offset + len_utf8), result); + byte_offset += len_utf8; + } +} + fn byte_from_char(c: char) -> u8 { let res = c as u32; - assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::Byte"); + assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::Byte(Str)"); res as u8 } From 63dc7da703759dd53536dd18a42ff65f39a2f9b4 Mon Sep 17 00:00:00 2001 From: Igor Matuszewski Date: Sun, 9 Jun 2019 15:44:18 +0200 Subject: [PATCH 10/11] Use char byte calculation using existing iterator --- src/libsyntax/parse/unescape.rs | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/libsyntax/parse/unescape.rs b/src/libsyntax/parse/unescape.rs index da6de8a12daaa..e816aa0271cf6 100644 --- a/src/libsyntax/parse/unescape.rs +++ b/src/libsyntax/parse/unescape.rs @@ -287,24 +287,26 @@ fn unescape_raw_str_or_byte_str(literal_text: &str, mode: Mode, callback: &mu where F: FnMut(Range, Result), { - let mut byte_offset: usize = 0; + assert!(mode.in_double_quotes()); + let initial_len = literal_text.len(); - let mut chars = literal_text.chars().peekable(); + let mut chars = literal_text.chars(); while let Some(curr) = chars.next() { - let (result, scanned) = match (curr, chars.peek()) { + let start = initial_len - chars.as_str().len() - curr.len_utf8(); + + let result = match (curr, chars.clone().next()) { ('\r', Some('\n')) => { chars.next(); - (Ok('\n'), [Some('\r'), Some('\n')]) + Ok('\n') }, - ('\r', _) => - (Err(EscapeError::BareCarriageReturn), [Some('\r'), None]), + ('\r', _) => Err(EscapeError::BareCarriageReturn), (c, _) if mode.is_bytes() && !c.is_ascii() => - (Err(EscapeError::NonAsciiCharInByteString), [Some(c), None]), - (c, _) => (Ok(c), [Some(c), None]), + Err(EscapeError::NonAsciiCharInByteString), + (c, _) => Ok(c), }; - let len_utf8: usize = scanned.iter().filter_map(|&x| x).map(char::len_utf8).sum(); - callback(byte_offset..(byte_offset + len_utf8), result); - byte_offset += len_utf8; + let end = initial_len - chars.as_str().len(); + + callback(start..end, result); } } From 630d5f355fc85fc2c3bab28a278c517d945d328d Mon Sep 17 00:00:00 2001 From: Igor Matuszewski Date: Mon, 10 Jun 2019 17:32:15 +0200 Subject: [PATCH 11/11] Don't suggest using \r in raw strings --- src/libsyntax/parse/unescape.rs | 3 ++- src/libsyntax/parse/unescape_error_reporting.rs | 5 +++++ src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.rs | 2 +- .../ui/parser/lex-bare-cr-string-literal-doc-comment.stderr | 2 +- src/test/ui/parser/raw-byte-string-literals.rs | 2 +- src/test/ui/parser/raw-byte-string-literals.stderr | 2 +- 6 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/libsyntax/parse/unescape.rs b/src/libsyntax/parse/unescape.rs index e816aa0271cf6..22cce67b5eeb7 100644 --- a/src/libsyntax/parse/unescape.rs +++ b/src/libsyntax/parse/unescape.rs @@ -12,6 +12,7 @@ pub(crate) enum EscapeError { LoneSlash, InvalidEscape, BareCarriageReturn, + BareCarriageReturnInRawString, EscapeOnlyChar, TooShortHexEscape, @@ -299,7 +300,7 @@ where chars.next(); Ok('\n') }, - ('\r', _) => Err(EscapeError::BareCarriageReturn), + ('\r', _) => Err(EscapeError::BareCarriageReturnInRawString), (c, _) if mode.is_bytes() && !c.is_ascii() => Err(EscapeError::NonAsciiCharInByteString), (c, _) => Ok(c), diff --git a/src/libsyntax/parse/unescape_error_reporting.rs b/src/libsyntax/parse/unescape_error_reporting.rs index 8f152974a6d3f..71b41161ad8c6 100644 --- a/src/libsyntax/parse/unescape_error_reporting.rs +++ b/src/libsyntax/parse/unescape_error_reporting.rs @@ -80,6 +80,11 @@ pub(crate) fn emit_unescape_error( }; handler.span_err(span, msg); } + EscapeError::BareCarriageReturnInRawString => { + assert!(mode.in_double_quotes()); + let msg = "bare CR not allowed in raw string"; + handler.span_err(span, msg); + } EscapeError::InvalidEscape => { let (c, span) = last_char(); diff --git a/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.rs b/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.rs index ed5df42f9dd4e..b588b007ae929 100644 --- a/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.rs +++ b/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.rs @@ -21,7 +21,7 @@ fn main() { let _s = "foo bar"; //~ ERROR: bare CR not allowed in string // the following string literal has a bare CR in it - let _s = r"bar foo"; //~ ERROR: bare CR not allowed in string + let _s = r"bar foo"; //~ ERROR: bare CR not allowed in raw string // the following string literal has a bare CR in it let _s = "foo\ bar"; //~ ERROR: unknown character escape: \r diff --git a/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.stderr b/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.stderr index 153237a7f71b4..b0fe4b6acd484 100644 --- a/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.stderr +++ b/src/test/ui/parser/lex-bare-cr-string-literal-doc-comment.stderr @@ -28,7 +28,7 @@ error: bare CR not allowed in string, use \r instead LL | let _s = "foo bar"; | ^ -error: bare CR not allowed in string, use \r instead +error: bare CR not allowed in raw string --> $DIR/lex-bare-cr-string-literal-doc-comment.rs:24:19 | LL | let _s = r"bar foo"; diff --git a/src/test/ui/parser/raw-byte-string-literals.rs b/src/test/ui/parser/raw-byte-string-literals.rs index 87ecfb5c5445c..534afabdf777e 100644 --- a/src/test/ui/parser/raw-byte-string-literals.rs +++ b/src/test/ui/parser/raw-byte-string-literals.rs @@ -1,7 +1,7 @@ // ignore-tidy-cr // compile-flags: -Z continue-parse-after-error pub fn main() { - br"a "; //~ ERROR bare CR not allowed in string + br"a "; //~ ERROR bare CR not allowed in raw string br"é"; //~ ERROR raw byte string must be ASCII br##~"a"~##; //~ ERROR only `#` is allowed in raw string delimitation } diff --git a/src/test/ui/parser/raw-byte-string-literals.stderr b/src/test/ui/parser/raw-byte-string-literals.stderr index 03fe79722b844..4880d1fdbe8a7 100644 --- a/src/test/ui/parser/raw-byte-string-literals.stderr +++ b/src/test/ui/parser/raw-byte-string-literals.stderr @@ -1,4 +1,4 @@ -error: bare CR not allowed in string, use \r instead +error: bare CR not allowed in raw string --> $DIR/raw-byte-string-literals.rs:4:9 | LL | br"a ";