From 0d8245b5b1f587ed2e52b6c97732299bdf7368df Mon Sep 17 00:00:00 2001
From: Fabian Wolff <fabian.wolff@alumni.ethz.ch>
Date: Thu, 9 Sep 2021 23:25:43 +0200
Subject: [PATCH] Improve diagnostics if a character literal contains combining
 marks

---
 .../src/lexer/unescape_error_reporting.rs     | 61 ++++++++++++++-----
 .../ui/parser/unicode-character-literal.fixed | 21 +++++++
 .../ui/parser/unicode-character-literal.rs    | 21 +++++++
 .../parser/unicode-character-literal.stderr   | 48 +++++++++++++++
 4 files changed, 137 insertions(+), 14 deletions(-)
 create mode 100644 src/test/ui/parser/unicode-character-literal.fixed
 create mode 100644 src/test/ui/parser/unicode-character-literal.rs
 create mode 100644 src/test/ui/parser/unicode-character-literal.stderr

diff --git a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
index aa6b424ce2b57..cef5b3a226bff 100644
--- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
+++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
@@ -3,7 +3,7 @@
 use std::iter::once;
 use std::ops::Range;
 
-use rustc_errors::{Applicability, Handler};
+use rustc_errors::{pluralize, Applicability, Handler};
 use rustc_lexer::unescape::{EscapeError, Mode};
 use rustc_span::{BytePos, Span};
 
@@ -49,24 +49,57 @@ pub(crate) fn emit_unescape_error(
                 .emit();
         }
         EscapeError::MoreThanOneChar => {
-            let (prefix, msg) = if mode.is_bytes() {
-                ("b", "if you meant to write a byte string literal, use double quotes")
-            } else {
-                ("", "if you meant to write a `str` literal, use double quotes")
-            };
+            use unicode_normalization::{char::is_combining_mark, UnicodeNormalization};
 
-            handler
-                .struct_span_err(
-                    span_with_quotes,
-                    "character literal may only contain one codepoint",
-                )
-                .span_suggestion(
+            let mut has_help = false;
+            let mut handler = handler.struct_span_err(
+                span_with_quotes,
+                "character literal may only contain one codepoint",
+            );
+
+            if lit.chars().skip(1).all(|c| is_combining_mark(c)) {
+                let escaped_marks =
+                    lit.chars().skip(1).map(|c| c.escape_default().to_string()).collect::<Vec<_>>();
+                handler.span_note(
+                    span,
+                    &format!(
+                        "this `{}` is followed by the combining mark{} `{}`",
+                        lit.chars().next().unwrap(),
+                        pluralize!(escaped_marks.len()),
+                        escaped_marks.join(""),
+                    ),
+                );
+                let normalized = lit.nfc().to_string();
+                if normalized.chars().count() == 1 {
+                    has_help = true;
+                    handler.span_suggestion(
+                        span,
+                        &format!(
+                            "consider using the normalized form `{}` of this character",
+                            normalized.chars().next().unwrap().escape_default()
+                        ),
+                        normalized,
+                        Applicability::MachineApplicable,
+                    );
+                }
+            }
+
+            if !has_help {
+                let (prefix, msg) = if mode.is_bytes() {
+                    ("b", "if you meant to write a byte string literal, use double quotes")
+                } else {
+                    ("", "if you meant to write a `str` literal, use double quotes")
+                };
+
+                handler.span_suggestion(
                     span_with_quotes,
                     msg,
                     format!("{}\"{}\"", prefix, lit),
                     Applicability::MachineApplicable,
-                )
-                .emit();
+                );
+            }
+
+            handler.emit();
         }
         EscapeError::EscapeOnlyChar => {
             let (c, char_span) = last_char();
diff --git a/src/test/ui/parser/unicode-character-literal.fixed b/src/test/ui/parser/unicode-character-literal.fixed
new file mode 100644
index 0000000000000..26ef5ffa11a80
--- /dev/null
+++ b/src/test/ui/parser/unicode-character-literal.fixed
@@ -0,0 +1,21 @@
+// Regression test for #88684: Improve diagnostics for combining marks
+// in character literals.
+
+// run-rustfix
+
+fn main() {
+    let _spade = "♠️";
+    //~^ ERROR: character literal may only contain one codepoint
+    //~| NOTE: this `♠` is followed by the combining mark `\u{fe0f}`
+    //~| HELP: if you meant to write a `str` literal, use double quotes
+
+    let _s = "ṩ̂̊";
+    //~^ ERROR: character literal may only contain one codepoint
+    //~| NOTE: this `s` is followed by the combining marks `\u{323}\u{307}\u{302}\u{30a}`
+    //~| HELP: if you meant to write a `str` literal, use double quotes
+
+    let _a = 'Å';
+    //~^ ERROR: character literal may only contain one codepoint
+    //~| NOTE: this `A` is followed by the combining mark `\u{30a}`
+    //~| HELP: consider using the normalized form `\u{c5}` of this character
+}
diff --git a/src/test/ui/parser/unicode-character-literal.rs b/src/test/ui/parser/unicode-character-literal.rs
new file mode 100644
index 0000000000000..d331522c04cbb
--- /dev/null
+++ b/src/test/ui/parser/unicode-character-literal.rs
@@ -0,0 +1,21 @@
+// Regression test for #88684: Improve diagnostics for combining marks
+// in character literals.
+
+// run-rustfix
+
+fn main() {
+    let _spade = '♠️';
+    //~^ ERROR: character literal may only contain one codepoint
+    //~| NOTE: this `♠` is followed by the combining mark `\u{fe0f}`
+    //~| HELP: if you meant to write a `str` literal, use double quotes
+
+    let _s = 'ṩ̂̊';
+    //~^ ERROR: character literal may only contain one codepoint
+    //~| NOTE: this `s` is followed by the combining marks `\u{323}\u{307}\u{302}\u{30a}`
+    //~| HELP: if you meant to write a `str` literal, use double quotes
+
+    let _a = 'Å';
+    //~^ ERROR: character literal may only contain one codepoint
+    //~| NOTE: this `A` is followed by the combining mark `\u{30a}`
+    //~| HELP: consider using the normalized form `\u{c5}` of this character
+}
diff --git a/src/test/ui/parser/unicode-character-literal.stderr b/src/test/ui/parser/unicode-character-literal.stderr
new file mode 100644
index 0000000000000..5cd3bd0fe69d7
--- /dev/null
+++ b/src/test/ui/parser/unicode-character-literal.stderr
@@ -0,0 +1,48 @@
+error: character literal may only contain one codepoint
+  --> $DIR/unicode-character-literal.rs:7:18
+   |
+LL |     let _spade = '♠️';
+   |                  ^^^
+   |
+note: this `♠` is followed by the combining mark `\u{fe0f}`
+  --> $DIR/unicode-character-literal.rs:7:19
+   |
+LL |     let _spade = '♠️';
+   |                   ^
+help: if you meant to write a `str` literal, use double quotes
+   |
+LL |     let _spade = "♠️";
+   |                  ~~~
+
+error: character literal may only contain one codepoint
+  --> $DIR/unicode-character-literal.rs:12:14
+   |
+LL |     let _s = 'ṩ̂̊';
+   |              ^^^
+   |
+note: this `s` is followed by the combining marks `\u{323}\u{307}\u{302}\u{30a}`
+  --> $DIR/unicode-character-literal.rs:12:15
+   |
+LL |     let _s = 'ṩ̂̊';
+   |               ^
+help: if you meant to write a `str` literal, use double quotes
+   |
+LL |     let _s = "ṩ̂̊";
+   |              ~~~
+
+error: character literal may only contain one codepoint
+  --> $DIR/unicode-character-literal.rs:17:14
+   |
+LL |     let _a = 'Å';
+   |              ^-^
+   |               |
+   |               help: consider using the normalized form `\u{c5}` of this character: `Å`
+   |
+note: this `A` is followed by the combining mark `\u{30a}`
+  --> $DIR/unicode-character-literal.rs:17:15
+   |
+LL |     let _a = 'Å';
+   |               ^
+
+error: aborting due to 3 previous errors
+