From 8c7e107f1b44d4e1153e6249d06f911e6ace2caf Mon Sep 17 00:00:00 2001 From: Jade Date: Tue, 27 Apr 2021 23:26:28 -0700 Subject: [PATCH] Make str::from_utf8 const fn However: I am not at all confident in the soundness of this implementation. Everything done here is defined but: ``` // casting pointers to ints is unsafe in const fn because the const evaluator cannot // possibly know what the result of various operations like `address / 2` would be // pointers during const evaluation have no integral address, only an abstract one ``` And I don't know if an "abstract" address can be &'d. --- library/core/src/lib.rs | 3 +++ library/core/src/str/converts.rs | 31 ++++++++++++++++++++--------- library/core/src/str/validations.rs | 12 +++++++---- 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs index 0e2c140c367a9..df451ccdcb437 100644 --- a/library/core/src/lib.rs +++ b/library/core/src/lib.rs @@ -95,6 +95,7 @@ #![feature(const_ptr_write)] #![feature(const_raw_ptr_comparison)] #![feature(const_raw_ptr_deref)] +#![feature(const_raw_ptr_to_usize_cast)] #![feature(const_slice_from_raw_parts)] #![feature(const_slice_ptr_len)] #![feature(const_size_of_val)] @@ -103,9 +104,11 @@ #![feature(const_type_id)] #![feature(const_type_name)] #![feature(const_likely)] +#![feature(const_str_from_utf8_unchecked)] #![feature(const_unreachable_unchecked)] #![feature(const_maybe_uninit_assume_init)] #![feature(const_maybe_uninit_as_ptr)] +#![feature(str_internals)] #![feature(custom_inner_attributes)] #![feature(decl_macro)] #![feature(doc_cfg)] diff --git a/library/core/src/str/converts.rs b/library/core/src/str/converts.rs index 05ff7bb120dae..d282c823e84b8 100644 --- a/library/core/src/str/converts.rs +++ b/library/core/src/str/converts.rs @@ -82,10 +82,16 @@ use super::Utf8Error; /// assert_eq!("💖", sparkle_heart); /// ``` #[stable(feature = "rust1", since = "1.0.0")] -pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> { 
- run_utf8_validation(v)?; - // SAFETY: Just ran validation. - Ok(unsafe { from_utf8_unchecked(v) }) +#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "none")] +#[rustc_allow_const_fn_unstable(str_internals)] +#[rustc_allow_const_fn_unstable(const_str_from_utf8_unchecked)] +pub const fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> { + // ? is not available in const + match run_utf8_validation(v) { + // SAFETY: Just ran validation. + Ok(_) => Ok(unsafe { from_utf8_unchecked(v) }), + Err(e) => Err(e), + } } /// Converts a mutable slice of bytes to a mutable string slice. @@ -119,10 +125,16 @@ pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> { /// See the docs for [`Utf8Error`] for more details on the kinds of /// errors that can be returned. #[stable(feature = "str_mut_extras", since = "1.20.0")] -pub fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> { - run_utf8_validation(v)?; - // SAFETY: Just ran validation. - Ok(unsafe { from_utf8_unchecked_mut(v) }) +#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "none")] +#[rustc_allow_const_fn_unstable(str_internals)] +#[rustc_allow_const_fn_unstable(const_str_from_utf8_unchecked)] +pub const fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> { + // ? is not available in const + match run_utf8_validation(v) { + // SAFETY: Just ran validation. 
+ Ok(_) => Ok(unsafe { from_utf8_unchecked_mut(v) }), + Err(e) => Err(e), + } } /// Converts a slice of bytes to a string slice without checking @@ -183,7 +195,8 @@ pub const unsafe fn from_utf8_unchecked(v: &[u8]) -> &str { /// ``` #[inline] #[stable(feature = "str_mut_extras", since = "1.20.0")] -pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str { +#[rustc_const_unstable(feature = "const_str_from_utf8_unchecked", issue = "75196")] +pub const unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str { // SAFETY: the caller must guarantee that the bytes `v` // are valid UTF-8, thus the cast to `*mut str` is safe. // Also, the pointer dereference is safe because that pointer diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs index 373a8212425ac..6626d0c69b105 100644 --- a/library/core/src/str/validations.rs +++ b/library/core/src/str/validations.rs @@ -105,21 +105,25 @@ const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize; /// Returns `true` if any byte in the word `x` is nonascii (>= 128). #[inline] -fn contains_nonascii(x: usize) -> bool { +const fn contains_nonascii(x: usize) -> bool { (x & NONASCII_MASK) != 0 } /// Walks through `v` checking that it's a valid UTF-8 sequence, /// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`. 
#[inline(always)] -pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { +#[rustc_const_unstable(feature = "str_internals", issue = "none")] +pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { let mut index = 0; let len = v.len(); let usize_bytes = mem::size_of::<usize>(); let ascii_block_size = 2 * usize_bytes; let blocks_end = if len >= ascii_block_size { len - ascii_block_size + 1 } else { 0 }; - let align = v.as_ptr().align_offset(usize_bytes); + // FIXME(lf-) align_offset is not const fn yet, so we do it manually + let mask = usize_bytes - 1; + // SAFETY: uh help pls + let align = (usize_bytes - (unsafe { v.as_ptr() as usize } & mask)) & mask; while index < len { let old_offset = index; @@ -230,7 +234,7 @@ pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { } // https://tools.ietf.org/html/rfc3629 -static UTF8_CHAR_WIDTH: [u8; 256] = [ +const UTF8_CHAR_WIDTH: [u8; 256] = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x1F 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,