@@ -221,6 +221,93 @@ impl<'a> TryFrom<&'a CStr> for &'a CStr8 {
221
221
}
222
222
}
223
223
224
/// Decode the UTF-8 sequence starting at `offset` in `bytes` into a single
/// Latin-1 character.
///
/// Returns `(latin1_char, utf8_len)`, where `utf8_len` is the number of input
/// bytes (1 or 2) that made up the character.
///
/// # Panics
///
/// Panics if the character at `offset` cannot be encoded in Latin-1, i.e. its
/// code point is above `0xff`. Any lead byte that does not start a one- or
/// two-byte sequence is rejected the same way. Indexing past the end of
/// `bytes` also panics.
///
/// # Safety
///
/// The input `bytes` must be valid UTF-8.
const unsafe fn latin1_from_utf8_at_offset(bytes: &[u8], offset: usize) -> (u8, usize) {
    let lead = bytes[offset];

    // Single-byte (ASCII) sequence: the byte is the character.
    if lead & 0b1000_0000 == 0b0000_0000 {
        return (lead, 1);
    }

    // Latin-1 code points only go up to 0xff, so anything that is not a
    // two-byte sequence (lead byte `110x_xxxx`) cannot be converted.
    if lead & 0b1110_0000 != 0b1100_0000 {
        panic!("input string cannot be encoded as Latin-1");
    }

    // Two-byte sequence: five payload bits from the lead byte followed by six
    // payload bits from the continuation byte.
    let high_bits = (lead & 0b0001_1111) as u16;
    let low_bits = (bytes[offset + 1] & 0b0011_1111) as u16;
    let code_point = (high_bits << 6) | low_bits;

    // A two-byte sequence can encode up to 0x7ff; only 0x80..=0xff fits.
    if code_point > 0xff {
        panic!("input string cannot be encoded as Latin-1");
    }

    (code_point as u8, 2)
}
252
+
253
+ /// Count the number of Latin-1 characters in a string.
254
+ ///
255
+ /// Panics if the string cannot be encoded in Latin-1.
256
+ ///
257
+ /// This is public but hidden; it is used in the `cstr8` macro.
258
+ #[ must_use]
259
+ pub const fn str_num_latin1_chars ( s : & str ) -> usize {
260
+ let bytes = s. as_bytes ( ) ;
261
+ let len = bytes. len ( ) ;
262
+
263
+ let mut offset = 0 ;
264
+ let mut num_latin1_chars = 0 ;
265
+
266
+ while offset < len {
267
+ // SAFETY: `bytes` is valid UTF-8.
268
+ let ( _, num_utf8_bytes) = unsafe { latin1_from_utf8_at_offset ( bytes, offset) } ;
269
+ offset += num_utf8_bytes;
270
+ num_latin1_chars += 1 ;
271
+ }
272
+
273
+ num_latin1_chars
274
+ }
275
+
276
+ /// Convert a `str` into a null-terminated Latin-1 character array.
277
+ ///
278
+ /// Panics if the string cannot be encoded in Latin-1.
279
+ ///
280
+ /// This is public but hidden; it is used in the `cstr8` macro.
281
+ #[ must_use]
282
+ pub const fn str_to_latin1 < const N : usize > ( s : & str ) -> [ u8 ; N ] {
283
+ let bytes = s. as_bytes ( ) ;
284
+ let len = bytes. len ( ) ;
285
+
286
+ let mut output = [ 0 ; N ] ;
287
+
288
+ let mut output_offset = 0 ;
289
+ let mut input_offset = 0 ;
290
+ while input_offset < len {
291
+ // SAFETY: `bytes` is valid UTF-8.
292
+ let ( ch, num_utf8_bytes) = unsafe { latin1_from_utf8_at_offset ( bytes, input_offset) } ;
293
+ if ch == 0 {
294
+ panic ! ( "interior null character" ) ;
295
+ } else {
296
+ output[ output_offset] = ch;
297
+ output_offset += 1 ;
298
+ input_offset += num_utf8_bytes;
299
+ }
300
+ }
301
+
302
+ // The output array must be one bigger than the converted string,
303
+ // to leave room for the trailing null character.
304
+ if output_offset + 1 != N {
305
+ panic ! ( "incorrect array length" ) ;
306
+ }
307
+
308
+ output
309
+ }
310
+
224
311
/// An UCS-2 null-terminated string slice.
225
312
///
226
313
/// This type is largely inspired by [`core::ffi::CStr`] with the exception that all characters are
0 commit comments