From 4d4ec97e0a11eae5878ae6715536875c5689f074 Mon Sep 17 00:00:00 2001
From: George Bateman <george.bateman16@gmail.com>
Date: Sun, 30 Jan 2022 22:16:41 +0000
Subject: [PATCH 1/3] Document char validity

---
 library/core/src/primitive_docs.rs | 38 ++++++++++++++++++++++++++----
 library/std/src/primitive_docs.rs  | 38 ++++++++++++++++++++++++++----
 2 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/library/core/src/primitive_docs.rs b/library/core/src/primitive_docs.rs
index 8fcd8cdeb1042..e8b4fffbdd275 100644
--- a/library/core/src/primitive_docs.rs
+++ b/library/core/src/primitive_docs.rs
@@ -279,16 +279,44 @@ mod prim_never {}
 ///
 /// The `char` type represents a single character. More specifically, since
 /// 'character' isn't a well-defined concept in Unicode, `char` is a '[Unicode
-/// scalar value]', which is similar to, but not the same as, a '[Unicode code
-/// point]'.
-///
-/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
-/// [Unicode code point]: https://www.unicode.org/glossary/#code_point
+/// scalar value]'.
 ///
 /// This documentation describes a number of methods and trait implementations on the
 /// `char` type. For technical reasons, there is additional, separate
 /// documentation in [the `std::char` module](char/index.html) as well.
 ///
+/// # Validity
+///
+/// A `char` is a '[Unicode scalar value]', which is any '[Unicode code point]'
+/// other than a [surrogate code point]. This has a fixed numerical definition:
+/// code points are in the range `'\0'` to `char::MAX` (`'\u{10FFFF}'`), inclusive.
+/// Surrogate code points, used by UTF-16, are in the range U+D800 to U+DFFF.
+///
+/// No `char` may be constructed, whether as a literal or at runtime, that is not a
+/// Unicode scalar value:
+///
+/// ```text
+/// let forbidden_chars = [
+///     // Each of these is a compiler error
+///     '\u{D800}', '\u{DFFF}', '\u{110000}',
+///
+///     // Panics; from_u32 returns None.
+///     char::from_u32(0xDE01).unwrap(),
+///
+///     // Undefined behaviour
+///     unsafe { char::from_u32_unchecked(0x110000) },
+/// ];
+/// ```
+///
+/// Unicode is regularly updated. Many USVs are not currently assigned to a
+/// character, but may be in the future ("reserved"); some will never be a character
+/// ("noncharacters"); and some may be given different meanings by different users
+/// ("private use").
+///
+/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
+/// [Unicode code point]: https://www.unicode.org/glossary/#code_point
+/// [surrogate code point]: https://www.unicode.org/glossary/#surrogate_code_point
+///
 /// # Representation
 ///
 /// `char` is always four bytes in size. This is a different representation than
diff --git a/library/std/src/primitive_docs.rs b/library/std/src/primitive_docs.rs
index 8fcd8cdeb1042..e8b4fffbdd275 100644
--- a/library/std/src/primitive_docs.rs
+++ b/library/std/src/primitive_docs.rs
@@ -279,16 +279,44 @@ mod prim_never {}
 ///
 /// The `char` type represents a single character. More specifically, since
 /// 'character' isn't a well-defined concept in Unicode, `char` is a '[Unicode
-/// scalar value]', which is similar to, but not the same as, a '[Unicode code
-/// point]'.
-///
-/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
-/// [Unicode code point]: https://www.unicode.org/glossary/#code_point
+/// scalar value]'.
 ///
 /// This documentation describes a number of methods and trait implementations on the
 /// `char` type. For technical reasons, there is additional, separate
 /// documentation in [the `std::char` module](char/index.html) as well.
 ///
+/// # Validity
+///
+/// A `char` is a '[Unicode scalar value]', which is any '[Unicode code point]'
+/// other than a [surrogate code point]. This has a fixed numerical definition:
+/// code points are in the range `'\0'` to `char::MAX` (`'\u{10FFFF}'`), inclusive.
+/// Surrogate code points, used by UTF-16, are in the range U+D800 to U+DFFF.
+///
+/// No `char` may be constructed, whether as a literal or at runtime, that is not a
+/// Unicode scalar value:
+///
+/// ```text
+/// let forbidden_chars = [
+///     // Each of these is a compiler error
+///     '\u{D800}', '\u{DFFF}', '\u{110000}',
+///
+///     // Panics; from_u32 returns None.
+///     char::from_u32(0xDE01).unwrap(),
+///
+///     // Undefined behaviour
+///     unsafe { char::from_u32_unchecked(0x110000) },
+/// ];
+/// ```
+///
+/// Unicode is regularly updated. Many USVs are not currently assigned to a
+/// character, but may be in the future ("reserved"); some will never be a character
+/// ("noncharacters"); and some may be given different meanings by different users
+/// ("private use").
+///
+/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
+/// [Unicode code point]: https://www.unicode.org/glossary/#code_point
+/// [surrogate code point]: https://www.unicode.org/glossary/#surrogate_code_point
+///
 /// # Representation
 ///
 /// `char` is always four bytes in size. This is a different representation than

From 5357ec1473c8a44dd8e324b2c664951bf4306b5a Mon Sep 17 00:00:00 2001
From: George Bateman <george.bateman16@gmail.com>
Date: Mon, 31 Jan 2022 23:49:16 +0000
Subject: [PATCH 2/3] (#93493) Add items from code review

---
 library/core/src/primitive_docs.rs | 50 +++++++++++++++++++++---------
 library/std/src/primitive_docs.rs  | 50 +++++++++++++++++++++---------
 2 files changed, 70 insertions(+), 30 deletions(-)

diff --git a/library/core/src/primitive_docs.rs b/library/core/src/primitive_docs.rs
index e8b4fffbdd275..b9d71178921f7 100644
--- a/library/core/src/primitive_docs.rs
+++ b/library/core/src/primitive_docs.rs
@@ -289,32 +289,52 @@ mod prim_never {}
 ///
 /// A `char` is a '[Unicode scalar value]', which is any '[Unicode code point]'
 /// other than a [surrogate code point]. This has a fixed numerical definition:
-/// code points are in the range `'\0'` to `char::MAX` (`'\u{10FFFF}'`), inclusive.
-/// Surrogate code points, used by UTF-16, are in the range U+D800 to U+DFFF.
+/// code points are in the range 0 to 0x10FFFF, inclusive.
+/// Surrogate code points, used by UTF-16, are in the range 0xD800 to 0xDFFF.
 ///
 /// No `char` may be constructed, whether as a literal or at runtime, that is not a
 /// Unicode scalar value:
 ///
 /// ```text
-/// let forbidden_chars = [
-///     // Each of these is a compiler error
-///     '\u{D800}', '\u{DFFF}', '\u{110000}',
+/// // Each of these is a compiler error
+/// ['\u{D800}', '\u{DFFF}', '\u{110000}'];
+/// ```
 ///
-///     // Panics; from_u32 returns None.
-///     char::from_u32(0xDE01).unwrap(),
+/// ```should_panic
+/// // Panics; from_u32 returns None.
+/// char::from_u32(0xDE01).unwrap();
+/// ```
 ///
-///     // Undefined behaviour
-///     unsafe { char::from_u32_unchecked(0x110000) },
-/// ];
+/// ```
+/// // Undefined behaviour
+/// unsafe { char::from_u32_unchecked(0x110000) };
 /// ```
 ///
-/// Unicode is regularly updated. Many USVs are not currently assigned to a
-/// character, but may be in the future ("reserved"); some will never be a character
-/// ("noncharacters"); and some may be given different meanings by different users
-/// ("private use").
+/// Unicode scalar values (USVs) are also the exact set of values that may be encoded in
+/// UTF-8. Because `char` values are USVs and `str` values are valid UTF-8, it is safe to
+/// store any `char` in a `str` or read any character from a `str` as a `char`.
+///
+/// The gap in valid `char` values is understood by the compiler, so in the
+/// example below the two ranges are understood to cover the whole range of
+/// possible `char` values and there is no error for a [non-exhaustive match].
+///
+/// ```
+/// let c: char = 'a';
+/// match c {
+///     '\0' ..= '\u{D7FF}' => false,
+///     '\u{E000}' ..= '\u{10FFFF}' => true,
+/// };
+/// ```
+///
+/// All USVs are valid `char` values, but not all of them represent a real
+/// character. Many USVs are not currently assigned to a character, but may be
+/// in the future ("reserved"); some will never be a character
+/// ("noncharacters"); and some may be given different meanings by different
+/// users ("private use").
 ///
-/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
 /// [Unicode code point]: https://www.unicode.org/glossary/#code_point
+/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
+/// [non-exhaustive match]: ../book/ch06-02-match.html#matches-are-exhaustive
 /// [surrogate code point]: https://www.unicode.org/glossary/#surrogate_code_point
 ///
 /// # Representation
diff --git a/library/std/src/primitive_docs.rs b/library/std/src/primitive_docs.rs
index e8b4fffbdd275..b9d71178921f7 100644
--- a/library/std/src/primitive_docs.rs
+++ b/library/std/src/primitive_docs.rs
@@ -289,32 +289,52 @@ mod prim_never {}
 ///
 /// A `char` is a '[Unicode scalar value]', which is any '[Unicode code point]'
 /// other than a [surrogate code point]. This has a fixed numerical definition:
-/// code points are in the range `'\0'` to `char::MAX` (`'\u{10FFFF}'`), inclusive.
-/// Surrogate code points, used by UTF-16, are in the range U+D800 to U+DFFF.
+/// code points are in the range 0 to 0x10FFFF, inclusive.
+/// Surrogate code points, used by UTF-16, are in the range 0xD800 to 0xDFFF.
 ///
 /// No `char` may be constructed, whether as a literal or at runtime, that is not a
 /// Unicode scalar value:
 ///
 /// ```text
-/// let forbidden_chars = [
-///     // Each of these is a compiler error
-///     '\u{D800}', '\u{DFFF}', '\u{110000}',
+/// // Each of these is a compiler error
+/// ['\u{D800}', '\u{DFFF}', '\u{110000}'];
+/// ```
 ///
-///     // Panics; from_u32 returns None.
-///     char::from_u32(0xDE01).unwrap(),
+/// ```should_panic
+/// // Panics; from_u32 returns None.
+/// char::from_u32(0xDE01).unwrap();
+/// ```
 ///
-///     // Undefined behaviour
-///     unsafe { char::from_u32_unchecked(0x110000) },
-/// ];
+/// ```
+/// // Undefined behaviour
+/// unsafe { char::from_u32_unchecked(0x110000) };
 /// ```
 ///
-/// Unicode is regularly updated. Many USVs are not currently assigned to a
-/// character, but may be in the future ("reserved"); some will never be a character
-/// ("noncharacters"); and some may be given different meanings by different users
-/// ("private use").
+/// Unicode scalar values (USVs) are also the exact set of values that may be encoded in
+/// UTF-8. Because `char` values are USVs and `str` values are valid UTF-8, it is safe to
+/// store any `char` in a `str` or read any character from a `str` as a `char`.
+///
+/// The gap in valid `char` values is understood by the compiler, so in the
+/// example below the two ranges are understood to cover the whole range of
+/// possible `char` values and there is no error for a [non-exhaustive match].
+///
+/// ```
+/// let c: char = 'a';
+/// match c {
+///     '\0' ..= '\u{D7FF}' => false,
+///     '\u{E000}' ..= '\u{10FFFF}' => true,
+/// };
+/// ```
+///
+/// All USVs are valid `char` values, but not all of them represent a real
+/// character. Many USVs are not currently assigned to a character, but may be
+/// in the future ("reserved"); some will never be a character
+/// ("noncharacters"); and some may be given different meanings by different
+/// users ("private use").
 ///
-/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
 /// [Unicode code point]: https://www.unicode.org/glossary/#code_point
+/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value
+/// [non-exhaustive match]: ../book/ch06-02-match.html#matches-are-exhaustive
 /// [surrogate code point]: https://www.unicode.org/glossary/#surrogate_code_point
 ///
 /// # Representation

From d372baf3f9a6401fdd4f74e3385ba553667b287d Mon Sep 17 00:00:00 2001
From: George Bateman <george.bateman16@gmail.com>
Date: Tue, 1 Feb 2022 21:44:53 +0000
Subject: [PATCH 3/3] Fix annotation of code blocks

---
 library/core/src/primitive_docs.rs | 5 +++--
 library/std/src/primitive_docs.rs  | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/library/core/src/primitive_docs.rs b/library/core/src/primitive_docs.rs
index b9d71178921f7..ebb1d8971b99d 100644
--- a/library/core/src/primitive_docs.rs
+++ b/library/core/src/primitive_docs.rs
@@ -275,6 +275,7 @@ mod prim_bool {}
 mod prim_never {}
 
 #[doc(primitive = "char")]
+#[allow(rustdoc::invalid_rust_codeblocks)]
 /// A character type.
 ///
 /// The `char` type represents a single character. More specifically, since
@@ -295,7 +296,7 @@ mod prim_never {}
 /// No `char` may be constructed, whether as a literal or at runtime, that is not a
 /// Unicode scalar value:
 ///
-/// ```text
+/// ```compile_fail
 /// // Each of these is a compiler error
 /// ['\u{D800}', '\u{DFFF}', '\u{110000}'];
 /// ```
@@ -305,7 +306,7 @@ mod prim_never {}
 /// char::from_u32(0xDE01).unwrap();
 /// ```
 ///
-/// ```
+/// ```no_run
 /// // Undefined behaviour
 /// unsafe { char::from_u32_unchecked(0x110000) };
 /// ```
diff --git a/library/std/src/primitive_docs.rs b/library/std/src/primitive_docs.rs
index b9d71178921f7..ebb1d8971b99d 100644
--- a/library/std/src/primitive_docs.rs
+++ b/library/std/src/primitive_docs.rs
@@ -275,6 +275,7 @@ mod prim_bool {}
 mod prim_never {}
 
 #[doc(primitive = "char")]
+#[allow(rustdoc::invalid_rust_codeblocks)]
 /// A character type.
 ///
 /// The `char` type represents a single character. More specifically, since
@@ -295,7 +296,7 @@ mod prim_never {}
 /// No `char` may be constructed, whether as a literal or at runtime, that is not a
 /// Unicode scalar value:
 ///
-/// ```text
+/// ```compile_fail
 /// // Each of these is a compiler error
 /// ['\u{D800}', '\u{DFFF}', '\u{110000}'];
 /// ```
@@ -305,7 +306,7 @@ mod prim_never {}
 /// char::from_u32(0xDE01).unwrap();
 /// ```
 ///
-/// ```
+/// ```no_run
 /// // Undefined behaviour
 /// unsafe { char::from_u32_unchecked(0x110000) };
 /// ```