Merge pull request #1151 from nicholasbishop/bishop-rework-str-macros

phip1611 · web-flow · commit 65aae97bd823 · 2024-04-22T07:25:51.000Z
Replace `cstr8!` with a declarative macro
diff --git a/uefi-macros/CHANGELOG.md b/uefi-macros/CHANGELOG.md
@@ -1,8 +1,8 @@
 # uefi-macros - [Unreleased]
 
 ## Removed
-- Removed the `cstr16` macro. Use the `cstr16` declarative macro exported by the
-  `uefi` crate instead.
+- Removed the `cstr8` and `cstr16` macros. Use the declarative macros of the
+  same names exported by the `uefi` crate as a replacement.
 
 # uefi-macros - 0.13.0 (2023-11-12)
 
diff --git a/uefi-macros/src/lib.rs b/uefi-macros/src/lib.rs
@@ -9,7 +9,7 @@ use quote::{quote, quote_spanned, TokenStreamExt};
 use syn::spanned::Spanned;
 use syn::{
     parse_macro_input, parse_quote, Error, Expr, ExprLit, ExprPath, FnArg, Ident, ItemFn,
-    ItemStruct, Lit, LitStr, Pat, Visibility,
+    ItemStruct, Lit, Pat, Visibility,
 };
 
 macro_rules! err {
@@ -247,44 +247,3 @@ pub fn entry(args: TokenStream, input: TokenStream) -> TokenStream {
     };
     result.into()
 }
-
-/// Builds a `CStr8` literal at compile time from a string literal.
-///
-/// This will throw a compile error if an invalid character is in the passed string.
-///
-/// # Example
-/// ```
-/// # use uefi_macros::cstr8;
-/// // Empty string
-/// assert_eq!(cstr8!().to_u16_slice_with_nul(), [0]);
-/// assert_eq!(cstr8!("").to_u16_slice_with_nul(), [0]);
-/// // Non-empty string
-/// assert_eq!(cstr8!("test").as_bytes(), [116, 101, 115, 116, 0]);
-/// ```
-#[proc_macro]
-pub fn cstr8(input: proc_macro::TokenStream) -> proc_macro::TokenStream {
-    // Accept empty input.
-    if input.is_empty() {
-        return quote!(unsafe { ::uefi::CStr16::from_u16_with_nul_unchecked(&[0]) }).into();
-    }
-    let input: LitStr = parse_macro_input!(input);
-    let input = input.value();
-    // Accept "" input.
-    if input.is_empty() {
-        return quote!(unsafe { ::uefi::CStr16::from_u16_with_nul_unchecked(&[0]) }).into();
-    }
-
-    // Accept any non-empty string input.
-    match input
-        .chars()
-        .map(u8::try_from)
-        .collect::<Result<Vec<u8>, _>>()
-    {
-        Ok(c) => {
-            quote!(unsafe { ::uefi::CStr8::from_bytes_with_nul_unchecked(&[ #(#c),* , 0 ]) }).into()
-        }
-        Err(_) => syn::Error::new_spanned(input, "invalid character in string")
-            .into_compile_error()
-            .into(),
-    }
-}
diff --git a/uefi/src/data_types/mod.rs b/uefi/src/data_types/mod.rs
@@ -151,6 +151,10 @@ pub use strs::{
     CStr16, CStr8, EqStrUntilNul, FromSliceWithNulError, FromStrWithBufError, UnalignedCStr16Error,
 };
 
+/// These functions are used in the implementation of the [`cstr8`] macro.
+#[doc(hidden)]
+pub use strs::{str_num_latin1_chars, str_to_latin1};
+
 #[cfg(feature = "alloc")]
 mod owned_strs;
 #[cfg(feature = "alloc")]
diff --git a/uefi/src/data_types/strs.rs b/uefi/src/data_types/strs.rs
@@ -221,6 +221,93 @@ impl<'a> TryFrom<&'a CStr> for &'a CStr8 {
     }
 }
 
+/// Decode the character starting at `offset` in a UTF-8 byte slice as Latin-1.
+///
+/// Returns a pair containing the Latin-1 character and the number of bytes in
+/// the UTF-8 encoding of that character (either 1 or 2).
+///
+/// Panics if the character starting at `offset` cannot be encoded in Latin-1.
+///
+/// # Safety
+///
+/// The input `bytes` must be valid UTF-8.
+const unsafe fn latin1_from_utf8_at_offset(bytes: &[u8], offset: usize) -> (u8, usize) {
+    if bytes[offset] & 0b1000_0000 == 0b0000_0000 {
+        (bytes[offset], 1) // One-byte sequence: the byte is the code point.
+    } else if bytes[offset] & 0b1110_0000 == 0b1100_0000 {
+        let a = (bytes[offset] & 0b0001_1111) as u16; // Low 5 bits of byte one.
+        let b = (bytes[offset + 1] & 0b0011_1111) as u16; // Low 6 bits of byte two.
+        let ch = a << 6 | b; // Reassembled code point.
+        if ch > 0xff {
+            panic!("input string cannot be encoded as Latin-1");
+        }
+        (ch as u8, 2)
+    } else {
+        // Latin-1 code points only go up to 0xff, so if the input contains any
+        // UTF-8 characters larger than two bytes it cannot be converted to
+        // Latin-1.
+        panic!("input string cannot be encoded as Latin-1");
+    }
+}
+
+/// Count the characters in a string (no null terminator is counted).
+///
+/// Panics if any character in the string cannot be encoded in Latin-1.
+///
+/// This is public but hidden; it is used in the `cstr8` macro.
+#[must_use]
+pub const fn str_num_latin1_chars(s: &str) -> usize {
+    let bytes = s.as_bytes();
+    let len = bytes.len();
+
+    let mut offset = 0;
+    let mut num_latin1_chars = 0;
+
+    while offset < len {
+        // SAFETY: `bytes` was obtained from a `&str`, so it is valid UTF-8.
+        let (_, num_utf8_bytes) = unsafe { latin1_from_utf8_at_offset(bytes, offset) };
+        offset += num_utf8_bytes; // Step over the whole UTF-8 sequence.
+        num_latin1_chars += 1;
+    }
+
+    num_latin1_chars
+}
+
+/// Convert a `str` into a null-terminated Latin-1 character array of length `N`.
+///
+/// Panics on non-Latin-1 input, an interior null, or `N != num chars + 1`.
+///
+/// This is public but hidden; it is used in the `cstr8` macro.
+#[must_use]
+pub const fn str_to_latin1<const N: usize>(s: &str) -> [u8; N] {
+    let bytes = s.as_bytes();
+    let len = bytes.len();
+
+    let mut output = [0; N]; // Zero-filled: bytes not written below stay null.
+
+    let mut output_offset = 0;
+    let mut input_offset = 0;
+    while input_offset < len {
+        // SAFETY: `bytes` was obtained from a `&str`, so it is valid UTF-8.
+        let (ch, num_utf8_bytes) = unsafe { latin1_from_utf8_at_offset(bytes, input_offset) };
+        if ch == 0 {
+            panic!("interior null character");
+        } else {
+            output[output_offset] = ch;
+            output_offset += 1;
+            input_offset += num_utf8_bytes;
+        }
+    }
+
+    // The output array must be one bigger than the converted string,
+    // to leave room for the trailing null character.
+    if output_offset + 1 != N {
+        panic!("incorrect array length");
+    }
+
+    output
+}
+
 /// An UCS-2 null-terminated string slice.
 ///
 /// This type is largely inspired by [`core::ffi::CStr`] with the exception that all characters are
diff --git a/uefi/src/lib.rs b/uefi/src/lib.rs
@@ -113,7 +113,7 @@ pub mod data_types;
 #[cfg(feature = "alloc")]
 pub use data_types::CString16;
 pub use data_types::{CStr16, CStr8, Char16, Char8, Event, Guid, Handle, Identify};
-pub use uefi_macros::{cstr8, entry};
+pub use uefi_macros::entry;
 pub use uguid::guid;
 
 mod result;
@@ -140,17 +140,3 @@ pub mod helpers;
 
 mod macros;
 mod util;
-
-#[cfg(test)]
-// Crates that create procedural macros can't unit test the macros they export.
-// Therefore, we do some tests here.
-mod macro_tests {
-    use crate::cstr8;
-
-    #[test]
-    fn cstr8_macro_literal() {
-        let _empty1 = cstr8!();
-        let _empty2 = cstr8!("");
-        let _regular = cstr8!("foobar");
-    }
-}
diff --git a/uefi/src/macros.rs b/uefi/src/macros.rs
@@ -1,3 +1,47 @@
+/// Encode a string literal as a [`&CStr8`].
+///
+/// The encoding is done at compile time, so the result can be used in a
+/// `const` item. Invalid input (non-Latin-1 or null chars) fails to compile.
+///
+/// An empty string containing just a null character can be created with either
+/// `cstr8!()` or `cstr8!("")`.
+///
+/// # Example
+///
+/// ```
+/// use uefi::{CStr8, cstr8};
+///
+/// const S: &CStr8 = cstr8!("abÿ");
+/// assert_eq!(S.as_bytes(), [97, 98, 255, 0]);
+///
+/// const EMPTY: &CStr8 = cstr8!();
+/// assert_eq!(EMPTY.as_bytes(), [0]);
+/// assert_eq!(cstr8!(""), EMPTY);
+/// ```
+///
+/// [`&CStr8`]: crate::CStr8
+#[macro_export]
+macro_rules! cstr8 {
+    () => {{
+        const S: &[u8] = &[0];
+        // SAFETY: `S` is a trivially correct Latin-1 C string.
+        unsafe { $crate::CStr8::from_bytes_with_nul_unchecked(S) }
+    }};
+    ($s:literal) => {{
+        // Use `const` values here to force errors to happen at compile
+        // time.
+
+        // Add one for the trailing null char, which the count excludes.
+        const NUM_CHARS: usize = $crate::data_types::str_num_latin1_chars($s) + 1;
+
+        const VAL: [u8; NUM_CHARS] = $crate::data_types::str_to_latin1($s);
+
+        // SAFETY: the `str_to_latin1` function always produces a valid Latin-1
+        // string with a trailing null character.
+        unsafe { $crate::CStr8::from_bytes_with_nul_unchecked(&VAL) }
+    }};
+}
+
 /// Encode a string literal as a [`&CStr16`].
 ///
 /// The encoding is done at compile time, so the result can be used in a