From 0d8245b5b1f587ed2e52b6c97732299bdf7368df Mon Sep 17 00:00:00 2001 From: Fabian Wolff Date: Thu, 9 Sep 2021 23:25:43 +0200 Subject: [PATCH] Improve diagnostics if a character literal contains combining marks --- .../src/lexer/unescape_error_reporting.rs | 61 ++++++++++++++----- .../ui/parser/unicode-character-literal.fixed | 21 +++++++ .../ui/parser/unicode-character-literal.rs | 21 +++++++ .../parser/unicode-character-literal.stderr | 48 +++++++++++++++ 4 files changed, 137 insertions(+), 14 deletions(-) create mode 100644 src/test/ui/parser/unicode-character-literal.fixed create mode 100644 src/test/ui/parser/unicode-character-literal.rs create mode 100644 src/test/ui/parser/unicode-character-literal.stderr diff --git a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs index aa6b424ce2b57..cef5b3a226bff 100644 --- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs +++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs @@ -3,7 +3,7 @@ use std::iter::once; use std::ops::Range; -use rustc_errors::{Applicability, Handler}; +use rustc_errors::{pluralize, Applicability, Handler}; use rustc_lexer::unescape::{EscapeError, Mode}; use rustc_span::{BytePos, Span}; @@ -49,24 +49,57 @@ pub(crate) fn emit_unescape_error( .emit(); } EscapeError::MoreThanOneChar => { - let (prefix, msg) = if mode.is_bytes() { - ("b", "if you meant to write a byte string literal, use double quotes") - } else { - ("", "if you meant to write a `str` literal, use double quotes") - }; + use unicode_normalization::{char::is_combining_mark, UnicodeNormalization}; - handler - .struct_span_err( - span_with_quotes, - "character literal may only contain one codepoint", - ) - .span_suggestion( + let mut has_help = false; + let mut handler = handler.struct_span_err( + span_with_quotes, + "character literal may only contain one codepoint", + ); + + if lit.chars().skip(1).all(|c| is_combining_mark(c)) { + let escaped_marks = + lit.chars().skip(1).map(|c| c.escape_default().to_string()).collect::>(); + handler.span_note( + span, + &format!( + "this `{}` is followed by the combining mark{} `{}`", + lit.chars().next().unwrap(), + pluralize!(escaped_marks.len()), + escaped_marks.join(""), + ), + ); + let normalized = lit.nfc().to_string(); + if normalized.chars().count() == 1 { + has_help = true; + handler.span_suggestion( + span, + &format!( + "consider using the normalized form `{}` of this character", + normalized.chars().next().unwrap().escape_default() + ), + normalized, + Applicability::MachineApplicable, + ); + } + } + + if !has_help { + let (prefix, msg) = if mode.is_bytes() { + ("b", "if you meant to write a byte string literal, use double quotes") + } else { + ("", "if you meant to write a `str` literal, use double quotes") + }; + + handler.span_suggestion( span_with_quotes, msg, format!("{}\"{}\"", prefix, lit), Applicability::MachineApplicable, - ) - .emit(); + ); + } + + handler.emit(); } EscapeError::EscapeOnlyChar => { let (c, char_span) = last_char(); diff --git a/src/test/ui/parser/unicode-character-literal.fixed b/src/test/ui/parser/unicode-character-literal.fixed new file mode 100644 index 0000000000000..26ef5ffa11a80 --- /dev/null +++ b/src/test/ui/parser/unicode-character-literal.fixed @@ -0,0 +1,21 @@ +// Regression test for #88684: Improve diagnostics for combining marks +// in character literals. + +// run-rustfix + +fn main() { + let _spade = "♠️"; + //~^ ERROR: character literal may only contain one codepoint + //~| NOTE: this `♠` is followed by the combining mark `\u{fe0f}` + //~| HELP: if you meant to write a `str` literal, use double quotes + + let _s = "ṩ̂̊"; + //~^ ERROR: character literal may only contain one codepoint + //~| NOTE: this `s` is followed by the combining marks `\u{323}\u{307}\u{302}\u{30a}` + //~| HELP: if you meant to write a `str` literal, use double quotes + + let _a = 'Å'; + //~^ ERROR: character literal may only contain one codepoint + //~| NOTE: this `A` is followed by the combining mark `\u{30a}` + //~| HELP: consider using the normalized form `\u{c5}` of this character +} diff --git a/src/test/ui/parser/unicode-character-literal.rs b/src/test/ui/parser/unicode-character-literal.rs new file mode 100644 index 0000000000000..d331522c04cbb --- /dev/null +++ b/src/test/ui/parser/unicode-character-literal.rs @@ -0,0 +1,21 @@ +// Regression test for #88684: Improve diagnostics for combining marks +// in character literals. + +// run-rustfix + +fn main() { + let _spade = '♠️'; + //~^ ERROR: character literal may only contain one codepoint + //~| NOTE: this `♠` is followed by the combining mark `\u{fe0f}` + //~| HELP: if you meant to write a `str` literal, use double quotes + + let _s = 'ṩ̂̊'; + //~^ ERROR: character literal may only contain one codepoint + //~| NOTE: this `s` is followed by the combining marks `\u{323}\u{307}\u{302}\u{30a}` + //~| HELP: if you meant to write a `str` literal, use double quotes + + let _a = 'Å'; + //~^ ERROR: character literal may only contain one codepoint + //~| NOTE: this `A` is followed by the combining mark `\u{30a}` + //~| HELP: consider using the normalized form `\u{c5}` of this character +} diff --git a/src/test/ui/parser/unicode-character-literal.stderr b/src/test/ui/parser/unicode-character-literal.stderr new file mode 100644 index 0000000000000..5cd3bd0fe69d7 --- /dev/null +++ b/src/test/ui/parser/unicode-character-literal.stderr @@ -0,0 +1,48 @@ +error: character literal may only contain one codepoint + --> $DIR/unicode-character-literal.rs:7:18 + | +LL | let _spade = '♠️'; + | ^^^ + | +note: this `♠` is followed by the combining mark `\u{fe0f}` + --> $DIR/unicode-character-literal.rs:7:19 + | +LL | let _spade = '♠️'; + | ^ +help: if you meant to write a `str` literal, use double quotes + | +LL | let _spade = "♠️"; + | ~~~ + +error: character literal may only contain one codepoint + --> $DIR/unicode-character-literal.rs:12:14 + | +LL | let _s = 'ṩ̂̊'; + | ^^^ + | +note: this `s` is followed by the combining marks `\u{323}\u{307}\u{302}\u{30a}` + --> $DIR/unicode-character-literal.rs:12:15 + | +LL | let _s = 'ṩ̂̊'; + | ^ +help: if you meant to write a `str` literal, use double quotes + | +LL | let _s = "ṩ̂̊"; + | ~~~ + +error: character literal may only contain one codepoint + --> $DIR/unicode-character-literal.rs:17:14 + | +LL | let _a = 'Å'; + | ^-^ + | | + | help: consider using the normalized form `\u{c5}` of this character: `Å` + | +note: this `A` is followed by the combining mark `\u{30a}` + --> $DIR/unicode-character-literal.rs:17:15 + | +LL | let _a = 'Å'; + | ^ + +error: aborting due to 3 previous errors +