From 4d4ec97e0a11eae5878ae6715536875c5689f074 Mon Sep 17 00:00:00 2001 From: George Bateman Date: Sun, 30 Jan 2022 22:16:41 +0000 Subject: [PATCH 1/3] Document char validity --- library/core/src/primitive_docs.rs | 38 ++++++++++++++++++++++++++---- library/std/src/primitive_docs.rs | 38 ++++++++++++++++++++++++++---- 2 files changed, 66 insertions(+), 10 deletions(-) diff --git a/library/core/src/primitive_docs.rs b/library/core/src/primitive_docs.rs index 8fcd8cdeb1042..e8b4fffbdd275 100644 --- a/library/core/src/primitive_docs.rs +++ b/library/core/src/primitive_docs.rs @@ -279,16 +279,44 @@ mod prim_never {} /// /// The `char` type represents a single character. More specifically, since /// 'character' isn't a well-defined concept in Unicode, `char` is a '[Unicode -/// scalar value]', which is similar to, but not the same as, a '[Unicode code -/// point]'. -/// -/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value -/// [Unicode code point]: https://www.unicode.org/glossary/#code_point +/// scalar value]'. /// /// This documentation describes a number of methods and trait implementations on the /// `char` type. For technical reasons, there is additional, separate /// documentation in [the `std::char` module](char/index.html) as well. /// +/// # Validity +/// +/// A `char` is a '[Unicode scalar value]', which is any '[Unicode code point]' +/// other than a [surrogate code point]. This has a fixed numerical definition: +/// code points are in the range `'\0'` to `char::MAX` (`'\u{10FFFF}'`), inclusive. +/// Surrogate code points, used by UTF-16, are in the range U+D800 to U+DFFF. +/// +/// No `char` may be constructed, whether as a literal or at runtime, that is not a +/// Unicode scalar value: +/// +/// ```text +/// let forbidden_chars = [ +/// // Each of these is a compiler error +/// '\u{D800}', '\u{DFFF}', '\u{110000}', +/// +/// // Panics; from_u32 returns None. +/// char::from_u32(0xDE01).unwrap(), +/// +/// // Undefined behaviour +/// unsafe { char::from_u32_unchecked(0x110000) }, +/// ]; +/// ``` +/// +/// Unicode is regularly updated. Many USVs are not currently assigned to a +/// character, but may be in the future ("reserved"); some will never be a character +/// ("noncharacters"); and some may be given different meanings by different users +/// ("private use"). +/// +/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value +/// [Unicode code point]: https://www.unicode.org/glossary/#code_point +/// [surrogate code point]: https://www.unicode.org/glossary/#surrogate_code_point +/// /// # Representation /// /// `char` is always four bytes in size. This is a different representation than diff --git a/library/std/src/primitive_docs.rs b/library/std/src/primitive_docs.rs index 8fcd8cdeb1042..e8b4fffbdd275 100644 --- a/library/std/src/primitive_docs.rs +++ b/library/std/src/primitive_docs.rs @@ -279,16 +279,44 @@ mod prim_never {} /// /// The `char` type represents a single character. More specifically, since /// 'character' isn't a well-defined concept in Unicode, `char` is a '[Unicode -/// scalar value]', which is similar to, but not the same as, a '[Unicode code -/// point]'. -/// -/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value -/// [Unicode code point]: https://www.unicode.org/glossary/#code_point +/// scalar value]'. /// /// This documentation describes a number of methods and trait implementations on the /// `char` type. For technical reasons, there is additional, separate /// documentation in [the `std::char` module](char/index.html) as well. /// +/// # Validity +/// +/// A `char` is a '[Unicode scalar value]', which is any '[Unicode code point]' +/// other than a [surrogate code point]. This has a fixed numerical definition: +/// code points are in the range `'\0'` to `char::MAX` (`'\u{10FFFF}'`), inclusive. +/// Surrogate code points, used by UTF-16, are in the range U+D800 to U+DFFF. +/// +/// No `char` may be constructed, whether as a literal or at runtime, that is not a +/// Unicode scalar value: +/// +/// ```text +/// let forbidden_chars = [ +/// // Each of these is a compiler error +/// '\u{D800}', '\u{DFFF}', '\u{110000}', +/// +/// // Panics; from_u32 returns None. +/// char::from_u32(0xDE01).unwrap(), +/// +/// // Undefined behaviour +/// unsafe { char::from_u32_unchecked(0x110000) }, +/// ]; +/// ``` +/// +/// Unicode is regularly updated. Many USVs are not currently assigned to a +/// character, but may be in the future ("reserved"); some will never be a character +/// ("noncharacters"); and some may be given different meanings by different users +/// ("private use"). +/// +/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value +/// [Unicode code point]: https://www.unicode.org/glossary/#code_point +/// [surrogate code point]: https://www.unicode.org/glossary/#surrogate_code_point +/// /// # Representation /// /// `char` is always four bytes in size. This is a different representation than From 5357ec1473c8a44dd8e324b2c664951bf4306b5a Mon Sep 17 00:00:00 2001 From: George Bateman Date: Mon, 31 Jan 2022 23:49:16 +0000 Subject: [PATCH 2/3] (#93493) Add items from code review --- library/core/src/primitive_docs.rs | 50 +++++++++++++++++++++--------- library/std/src/primitive_docs.rs | 50 +++++++++++++++++++++--------- 2 files changed, 70 insertions(+), 30 deletions(-) diff --git a/library/core/src/primitive_docs.rs b/library/core/src/primitive_docs.rs index e8b4fffbdd275..b9d71178921f7 100644 --- a/library/core/src/primitive_docs.rs +++ b/library/core/src/primitive_docs.rs @@ -289,32 +289,52 @@ mod prim_never {} /// /// A `char` is a '[Unicode scalar value]', which is any '[Unicode code point]' /// other than a [surrogate code point]. This has a fixed numerical definition: -/// code points are in the range `'\0'` to `char::MAX` (`'\u{10FFFF}'`), inclusive. -/// Surrogate code points, used by UTF-16, are in the range U+D800 to U+DFFF. +/// code points are in the range 0 to 0x10FFFF, inclusive. +/// Surrogate code points, used by UTF-16, are in the range 0xD800 to 0xDFFF. /// /// No `char` may be constructed, whether as a literal or at runtime, that is not a /// Unicode scalar value: /// /// ```text -/// let forbidden_chars = [ -/// // Each of these is a compiler error -/// '\u{D800}', '\u{DFFF}', '\u{110000}', +/// // Each of these is a compiler error +/// ['\u{D800}', '\u{DFFF}', '\u{110000}']; +/// ``` /// -/// // Panics; from_u32 returns None. -/// char::from_u32(0xDE01).unwrap(), +/// ```should_panic +/// // Panics; from_u32 returns None. +/// char::from_u32(0xDE01).unwrap(); +/// ``` /// -/// // Undefined behaviour -/// unsafe { char::from_u32_unchecked(0x110000) }, -/// ]; +/// ``` +/// // Undefined behaviour +/// unsafe { char::from_u32_unchecked(0x110000) }; /// ``` /// -/// Unicode is regularly updated. Many USVs are not currently assigned to a -/// character, but may be in the future ("reserved"); some will never be a character -/// ("noncharacters"); and some may be given different meanings by different users -/// ("private use"). +/// USVs are also the exact set of values that may be encoded in UTF-8. Because +/// `char` values are USVs and `str` values are valid UTF-8, it is safe to store +/// any `char` in a `str` or read any character from a `str` as a `char`. +/// +/// The gap in valid `char` values is understood by the compiler, so in the +/// below example the two ranges are understood to cover the whole range of +/// possible `char` values and there is no error for a [non-exhaustive match]. +/// +/// ``` +/// let c: char = 'a'; +/// match c { +/// '\0' ..= '\u{D7FF}' => false, +/// '\u{E000}' ..= '\u{10FFFF}' => true, +/// }; +/// ``` +/// +/// All USVs are valid `char` values, but not all of them represent a real +/// character. Many USVs are not currently assigned to a character, but may be +/// in the future ("reserved"); some will never be a character +/// ("noncharacters"); and some may be given different meanings by different +/// users ("private use"). /// -/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value /// [Unicode code point]: https://www.unicode.org/glossary/#code_point +/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value +/// [non-exhaustive match]: ../book/ch06-02-match.html#matches-are-exhaustive /// [surrogate code point]: https://www.unicode.org/glossary/#surrogate_code_point /// /// # Representation diff --git a/library/std/src/primitive_docs.rs b/library/std/src/primitive_docs.rs index e8b4fffbdd275..b9d71178921f7 100644 --- a/library/std/src/primitive_docs.rs +++ b/library/std/src/primitive_docs.rs @@ -289,32 +289,52 @@ mod prim_never {} /// /// A `char` is a '[Unicode scalar value]', which is any '[Unicode code point]' /// other than a [surrogate code point]. This has a fixed numerical definition: -/// code points are in the range `'\0'` to `char::MAX` (`'\u{10FFFF}'`), inclusive. -/// Surrogate code points, used by UTF-16, are in the range U+D800 to U+DFFF. +/// code points are in the range 0 to 0x10FFFF, inclusive. +/// Surrogate code points, used by UTF-16, are in the range 0xD800 to 0xDFFF. /// /// No `char` may be constructed, whether as a literal or at runtime, that is not a /// Unicode scalar value: /// /// ```text -/// let forbidden_chars = [ -/// // Each of these is a compiler error -/// '\u{D800}', '\u{DFFF}', '\u{110000}', +/// // Each of these is a compiler error +/// ['\u{D800}', '\u{DFFF}', '\u{110000}']; +/// ``` /// -/// // Panics; from_u32 returns None. -/// char::from_u32(0xDE01).unwrap(), +/// ```should_panic +/// // Panics; from_u32 returns None. +/// char::from_u32(0xDE01).unwrap(); +/// ``` /// -/// // Undefined behaviour -/// unsafe { char::from_u32_unchecked(0x110000) }, -/// ]; +/// ``` +/// // Undefined behaviour +/// unsafe { char::from_u32_unchecked(0x110000) }; /// ``` /// -/// Unicode is regularly updated. Many USVs are not currently assigned to a -/// character, but may be in the future ("reserved"); some will never be a character -/// ("noncharacters"); and some may be given different meanings by different users -/// ("private use"). +/// USVs are also the exact set of values that may be encoded in UTF-8. Because +/// `char` values are USVs and `str` values are valid UTF-8, it is safe to store +/// any `char` in a `str` or read any character from a `str` as a `char`. +/// +/// The gap in valid `char` values is understood by the compiler, so in the +/// below example the two ranges are understood to cover the whole range of +/// possible `char` values and there is no error for a [non-exhaustive match]. +/// +/// ``` +/// let c: char = 'a'; +/// match c { +/// '\0' ..= '\u{D7FF}' => false, +/// '\u{E000}' ..= '\u{10FFFF}' => true, +/// }; +/// ``` +/// +/// All USVs are valid `char` values, but not all of them represent a real +/// character. Many USVs are not currently assigned to a character, but may be +/// in the future ("reserved"); some will never be a character +/// ("noncharacters"); and some may be given different meanings by different +/// users ("private use"). /// -/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value /// [Unicode code point]: https://www.unicode.org/glossary/#code_point +/// [Unicode scalar value]: https://www.unicode.org/glossary/#unicode_scalar_value +/// [non-exhaustive match]: ../book/ch06-02-match.html#matches-are-exhaustive /// [surrogate code point]: https://www.unicode.org/glossary/#surrogate_code_point /// /// # Representation From d372baf3f9a6401fdd4f74e3385ba553667b287d Mon Sep 17 00:00:00 2001 From: George Bateman Date: Tue, 1 Feb 2022 21:44:53 +0000 Subject: [PATCH 3/3] Fix annotation of code blocks --- library/core/src/primitive_docs.rs | 5 +++-- library/std/src/primitive_docs.rs | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/library/core/src/primitive_docs.rs b/library/core/src/primitive_docs.rs index b9d71178921f7..ebb1d8971b99d 100644 --- a/library/core/src/primitive_docs.rs +++ b/library/core/src/primitive_docs.rs @@ -275,6 +275,7 @@ mod prim_bool {} mod prim_never {} #[doc(primitive = "char")] +#[allow(rustdoc::invalid_rust_codeblocks)] /// A character type. /// /// The `char` type represents a single character. More specifically, since @@ -295,7 +296,7 @@ mod prim_never {} /// No `char` may be constructed, whether as a literal or at runtime, that is not a /// Unicode scalar value: /// -/// ```text +/// ```compile_fail /// // Each of these is a compiler error /// ['\u{D800}', '\u{DFFF}', '\u{110000}']; /// ``` @@ -305,7 +306,7 @@ mod prim_never {} /// char::from_u32(0xDE01).unwrap(); /// ``` /// -/// ``` +/// ```no_run /// // Undefined behaviour /// unsafe { char::from_u32_unchecked(0x110000) }; /// ``` diff --git a/library/std/src/primitive_docs.rs b/library/std/src/primitive_docs.rs index b9d71178921f7..ebb1d8971b99d 100644 --- a/library/std/src/primitive_docs.rs +++ b/library/std/src/primitive_docs.rs @@ -275,6 +275,7 @@ mod prim_bool {} mod prim_never {} #[doc(primitive = "char")] +#[allow(rustdoc::invalid_rust_codeblocks)] /// A character type. /// /// The `char` type represents a single character. More specifically, since @@ -295,7 +296,7 @@ mod prim_never {} /// No `char` may be constructed, whether as a literal or at runtime, that is not a /// Unicode scalar value: /// -/// ```text +/// ```compile_fail /// // Each of these is a compiler error /// ['\u{D800}', '\u{DFFF}', '\u{110000}']; /// ``` @@ -305,7 +306,7 @@ mod prim_never {} /// char::from_u32(0xDE01).unwrap(); /// ``` /// -/// ``` +/// ```no_run /// // Undefined behaviour /// unsafe { char::from_u32_unchecked(0x110000) }; /// ```