uefi: Add new cstr8 implementation

nicholasbishop · nicholasbishop · commit bdb290c0aab1 · 2024-04-22T01:50:55.000-04:00
Implement cstr8 as a declarative (`macro_rules!`) macro, similar to cstr16, replacing the proc macro previously re-exported from uefi_macros.
diff --git a/uefi/src/data_types/mod.rs b/uefi/src/data_types/mod.rs
@@ -151,6 +151,10 @@ pub use strs::{
     CStr16, CStr8, EqStrUntilNul, FromSliceWithNulError, FromStrWithBufError, UnalignedCStr16Error,
 };
 
+/// These functions are used in the implementation of the [`cstr8`] macro.
+#[doc(hidden)]
+pub use strs::{str_num_latin1_chars, str_to_latin1};
+
 #[cfg(feature = "alloc")]
 mod owned_strs;
 #[cfg(feature = "alloc")]
diff --git a/uefi/src/data_types/strs.rs b/uefi/src/data_types/strs.rs
@@ -221,6 +221,91 @@ impl<'a> TryFrom<&'a CStr> for &'a CStr8 {
     }
 }
 
+/// Decode the UTF-8 character that starts at `bytes[offset]` as Latin-1.
+///
+/// Returns a pair containing the Latin-1 character and the number of bytes
+/// (1 or 2) that character occupies in the UTF-8 input.
+///
+/// Panics if that character is above U+00FF and so has no Latin-1 encoding.
+///
+/// # Safety
+///
+/// The input `bytes` must be valid UTF-8.
+const unsafe fn latin1_from_utf8_at_offset(bytes: &[u8], offset: usize) -> (u8, usize) {
+    if bytes[offset] & 0b1000_0000 == 0b0000_0000 {
+        (bytes[offset] as u8, 1) // high bit clear: ASCII, same value in Latin-1
+    } else if bytes[offset] & 0b1110_0000 == 0b1100_0000 {
+        let a = (bytes[offset] & 0b0001_1111) as u16; // 5 payload bits of the 110xxxxx lead byte
+        let b = (bytes[offset + 1] & 0b0011_1111) as u16; // 6 payload bits of the next byte
+        let ch = a << 6 | b; // 11 bits in total, so at most 0x7ff
+        if ch > 0xff {
+            panic!("input string cannot be encoded as Latin-1");
+        }
+        (ch as u8, 2) // lossless: `ch <= 0xff` was checked just above
+    } else {
+        // Remaining case for valid UTF-8: a sequence of three or more bytes,
+        // i.e. a code point above U+07FF. Latin-1 code points only go up to
+        // 0xff, so such input cannot be converted to Latin-1.
+        panic!("input string cannot be encoded as Latin-1");
+    }
+}
+
+/// Count the characters of `s`, checking that each one fits in Latin-1.
+///
+/// Panics if the string cannot be encoded in Latin-1.
+///
+/// This is public but hidden; the `cstr8` macro uses it to size its array.
+pub const fn str_num_latin1_chars(s: &str) -> usize {
+    let bytes = s.as_bytes();
+    let len = bytes.len();
+
+    let mut offset = 0;
+    let mut num_latin1_chars = 0;
+
+    while offset < len {
+        // SAFETY: `bytes` was obtained from a `&str`, so it is valid UTF-8.
+        let (_, num_utf8_bytes) = unsafe { latin1_from_utf8_at_offset(bytes, offset) };
+        offset += num_utf8_bytes as usize; // skip every UTF-8 byte of this character
+        num_latin1_chars += 1;
+    }
+
+    num_latin1_chars
+}
+
+/// Convert a `str` into a null-terminated Latin-1 character array.
+///
+/// Panics if `s` is not Latin-1, has a null character, or `N` != char count + 1.
+///
+/// This is public but hidden; it is used in the `cstr8` macro.
+pub const fn str_to_latin1<const N: usize>(s: &str) -> [u8; N] {
+    let bytes = s.as_bytes();
+    let len = bytes.len();
+
+    let mut output = [0; N]; // zero-filled: the last element stays 0 as the terminator
+
+    let mut output_offset = 0;
+    let mut input_offset = 0;
+    while input_offset < len {
+        // SAFETY: `bytes` was obtained from a `&str`, so it is valid UTF-8.
+        let (ch, num_utf8_bytes) = unsafe { latin1_from_utf8_at_offset(bytes, input_offset) };
+        if ch == 0 {
+            panic!("interior null character");
+        } else {
+            output[output_offset] = ch;
+            output_offset += 1; // one Latin-1 byte written per input character
+            input_offset += num_utf8_bytes;
+        }
+    }
+
+    // The output array must be exactly one element bigger than the converted
+    // string, so that the final zero-initialized byte is the trailing null.
+    if output_offset + 1 != N {
+        panic!("incorrect array length");
+    }
+
+    output
+}
+
 /// An UCS-2 null-terminated string slice.
 ///
 /// This type is largely inspired by [`core::ffi::CStr`] with the exception that all characters are
diff --git a/uefi/src/lib.rs b/uefi/src/lib.rs
@@ -113,7 +113,7 @@ pub mod data_types;
 #[cfg(feature = "alloc")]
 pub use data_types::CString16;
 pub use data_types::{CStr16, CStr8, Char16, Char8, Event, Guid, Handle, Identify};
-pub use uefi_macros::{cstr8, entry};
+pub use uefi_macros::entry;
 pub use uguid::guid;
 
 mod result;
@@ -140,17 +140,3 @@ pub mod helpers;
 
 mod macros;
 mod util;
-
-#[cfg(test)]
-// Crates that create procedural macros can't unit test the macros they export.
-// Therefore, we do some tests here.
-mod macro_tests {
-    use crate::cstr8;
-
-    #[test]
-    fn cstr8_macro_literal() {
-        let _empty1 = cstr8!();
-        let _empty2 = cstr8!("");
-        let _regular = cstr8!("foobar");
-    }
-}
diff --git a/uefi/src/macros.rs b/uefi/src/macros.rs
@@ -1,3 +1,47 @@
+/// Encode a string literal as a [`&CStr8`].
+///
+/// The encoding is done at compile time, so the result can be used in a
+/// `const` item, and a literal that cannot be encoded fails to compile.
+///
+/// An empty string containing just a null character can be created with either
+/// `cstr8!()` or `cstr8!("")`.
+///
+/// # Example
+///
+/// ```
+/// use uefi::{CStr8, cstr8};
+///
+/// const S: &CStr8 = cstr8!("abÿ");
+/// assert_eq!(S.as_bytes(), [97, 98, 255, 0]);
+///
+/// const EMPTY: &CStr8 = cstr8!();
+/// assert_eq!(EMPTY.as_bytes(), [0]);
+/// assert_eq!(cstr8!(""), EMPTY);
+/// ```
+///
+/// [`&CStr8`]: crate::CStr8
+#[macro_export]
+macro_rules! cstr8 {
+    () => {{
+        const S: &[u8] = &[0];
+        // SAFETY: `S` is a single null byte: a valid, empty Latin-1 C string.
+        unsafe { $crate::CStr8::from_bytes_with_nul_unchecked(S) }
+    }};
+    ($s:literal) => {{
+        // Both values below are `const` items, so a panic in the helper
+        // functions is reported as a compile-time error.
+
+        // Character count plus one for the trailing null char.
+        const NUM_CHARS: usize = $crate::data_types::str_num_latin1_chars($s) + 1;
+
+        const VAL: [u8; NUM_CHARS] = $crate::data_types::str_to_latin1($s);
+
+        // SAFETY: the `str_to_latin1` function always produces a valid Latin-1
+        // string with a trailing null character.
+        unsafe { $crate::CStr8::from_bytes_with_nul_unchecked(&VAL) }
+    }};
+}
+
 /// Encode a string literal as a [`&CStr16`].
 ///
 /// The encoding is done at compile time, so the result can be used in a