Skip to content

Commit 0b4efa4

Browse files
MitalAshok authored and yuxuanchen1997 committed
[Clang] [C23] Implement N2653: u8 strings are char8_t[] (#97208)
Summary: https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2653.htm
Closes #97202
---------
Co-authored-by: cor3ntin <[email protected]>
Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
Differential Revision: https://phabricator.intern.facebook.com/D60250894
1 parent 5bdc50d commit 0b4efa4

File tree

7 files changed

+72
-11
lines changed

7 files changed

+72
-11
lines changed

clang/docs/ReleaseNotes.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,12 @@ C23 Feature Support
362362
- Added the ``FLT_NORM_MAX``, ``DBL_NORM_MAX``, and ``LDBL_NORM_MAX`` to the
363363
freestanding implementation of ``<float.h>`` that ships with Clang.
364364

365+
- Compiler support for `N2653 char8_t: A type for UTF-8 characters and strings
366+
<https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2653.htm>`_: ``u8`` string
367+
literals are now of type ``char8_t[N]`` in C23 and expose
368+
``__CLANG_ATOMIC_CHAR8_T_LOCK_FREE``/``__GCC_ATOMIC_CHAR8_T_LOCK_FREE`` to
369+
implement the corresponding macro in ``<stdatomic.h>``.
370+
365371
Non-comprehensive list of changes in this release
366372
-------------------------------------------------
367373

clang/include/clang/Basic/DiagnosticSemaKinds.td

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7249,7 +7249,10 @@ def err_array_init_utf8_string_into_char : Error<
72497249
def warn_cxx20_compat_utf8_string : Warning<
72507250
"type of UTF-8 string literal will change from array of const char to "
72517251
"array of const char8_t in C++20">, InGroup<CXX20Compat>, DefaultIgnore;
7252-
def note_cxx20_compat_utf8_string_remove_u8 : Note<
7252+
def warn_c23_compat_utf8_string : Warning<
7253+
"type of UTF-8 string literal will change from array of char to "
7254+
"array of char8_t in C23">, InGroup<C23Compat>, DefaultIgnore;
7255+
def note_cxx20_c23_compat_utf8_string_remove_u8 : Note<
72537256
"remove 'u8' prefix to avoid a change of behavior; "
72547257
"Clang encodes unprefixed narrow string literals as UTF-8">;
72557258
def err_array_init_different_type : Error<

clang/lib/Frontend/InitPreprocessor.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1170,6 +1170,8 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
11701170
DefineType("__WCHAR_TYPE__", TI.getWCharType(), Builder);
11711171
DefineType("__WINT_TYPE__", TI.getWIntType(), Builder);
11721172
DefineTypeSizeAndWidth("__SIG_ATOMIC", TI.getSigAtomicType(), TI, Builder);
1173+
if (LangOpts.C23)
1174+
DefineType("__CHAR8_TYPE__", TI.UnsignedChar, Builder);
11731175
DefineType("__CHAR16_TYPE__", TI.getChar16Type(), Builder);
11741176
DefineType("__CHAR32_TYPE__", TI.getChar32Type(), Builder);
11751177

@@ -1349,8 +1351,10 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
13491351
getLockFreeValue(TI.get##Type##Width(), TI));
13501352
DEFINE_LOCK_FREE_MACRO(BOOL, Bool);
13511353
DEFINE_LOCK_FREE_MACRO(CHAR, Char);
1352-
if (LangOpts.Char8)
1353-
DEFINE_LOCK_FREE_MACRO(CHAR8_T, Char); // Treat char8_t like char.
1354+
// char8_t has the same representation / width as unsigned
1355+
// char in C++ and is a typedef for unsigned char in C23
1356+
if (LangOpts.Char8 || LangOpts.C23)
1357+
DEFINE_LOCK_FREE_MACRO(CHAR8_T, Char);
13541358
DEFINE_LOCK_FREE_MACRO(CHAR16_T, Char16);
13551359
DEFINE_LOCK_FREE_MACRO(CHAR32_T, Char32);
13561360
DEFINE_LOCK_FREE_MACRO(WCHAR_T, WChar);

clang/lib/Headers/stdatomic.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ extern "C" {
3535

3636
#define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE
3737
#define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE
38+
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
39+
#define ATOMIC_CHAR8_T_LOCK_FREE __CLANG_ATOMIC_CHAR8_T_LOCK_FREE
40+
#endif
3841
#define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE
3942
#define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE
4043
#define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE
@@ -104,6 +107,9 @@ typedef _Atomic(long) atomic_long;
104107
typedef _Atomic(unsigned long) atomic_ulong;
105108
typedef _Atomic(long long) atomic_llong;
106109
typedef _Atomic(unsigned long long) atomic_ullong;
110+
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
111+
typedef _Atomic(unsigned char) atomic_char8_t;
112+
#endif
107113
typedef _Atomic(uint_least16_t) atomic_char16_t;
108114
typedef _Atomic(uint_least32_t) atomic_char32_t;
109115
typedef _Atomic(wchar_t) atomic_wchar_t;

clang/lib/Sema/SemaExpr.cpp

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2051,6 +2051,8 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) {
20512051
} else if (Literal.isUTF8()) {
20522052
if (getLangOpts().Char8)
20532053
CharTy = Context.Char8Ty;
2054+
else if (getLangOpts().C23)
2055+
CharTy = Context.UnsignedCharTy;
20542056
Kind = StringLiteralKind::UTF8;
20552057
} else if (Literal.isUTF16()) {
20562058
CharTy = Context.Char16Ty;
@@ -2062,17 +2064,23 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) {
20622064
CharTy = Context.UnsignedCharTy;
20632065
}
20642066

2065-
// Warn on initializing an array of char from a u8 string literal; this
2066-
// becomes ill-formed in C++2a.
2067-
if (getLangOpts().CPlusPlus && !getLangOpts().CPlusPlus20 &&
2068-
!getLangOpts().Char8 && Kind == StringLiteralKind::UTF8) {
2069-
Diag(StringTokLocs.front(), diag::warn_cxx20_compat_utf8_string);
2067+
// Warn on u8 string literals before C++20 and C23, whose type
2068+
// was an array of char before but becomes an array of char8_t.
2069+
// In C++20, it cannot be used where a pointer to char is expected.
2070+
// In C23, it might have an unexpected value if char was signed.
2071+
if (Kind == StringLiteralKind::UTF8 &&
2072+
(getLangOpts().CPlusPlus
2073+
? !getLangOpts().CPlusPlus20 && !getLangOpts().Char8
2074+
: !getLangOpts().C23)) {
2075+
Diag(StringTokLocs.front(), getLangOpts().CPlusPlus
2076+
? diag::warn_cxx20_compat_utf8_string
2077+
: diag::warn_c23_compat_utf8_string);
20702078

20712079
// Create removals for all 'u8' prefixes in the string literal(s). This
2072-
// ensures C++2a compatibility (but may change the program behavior when
2080+
// ensures C++20/C23 compatibility (but may change the program behavior when
20732081
// built by non-Clang compilers for which the execution character set is
20742082
// not always UTF-8).
2075-
auto RemovalDiag = PDiag(diag::note_cxx20_compat_utf8_string_remove_u8);
2083+
auto RemovalDiag = PDiag(diag::note_cxx20_c23_compat_utf8_string_remove_u8);
20762084
SourceLocation RemovalDiagLoc;
20772085
for (const Token &Tok : StringToks) {
20782086
if (Tok.getKind() == tok::utf8_string_literal) {

clang/test/C/C23/n2653.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
// RUN: %clang_cc1 -ffreestanding -verify=c23 -std=c23 %s
2+
// RUN: %clang_cc1 -ffreestanding -verify=c17 -std=c17 %s
3+
4+
// c23-no-diagnostics
5+
6+
#include <stdatomic.h>
7+
8+
#define __enable_constant_folding(x) (__builtin_constant_p(x) ? (x) : (x))
9+
#define __is_same(a, b) (__extension__ _Generic(a, b: 1, default: 0) && __extension__ _Generic(b, a: 1, default: 0))
10+
11+
#ifndef ATOMIC_CHAR8_T_LOCK_FREE
12+
#error missing
13+
#endif
14+
// c17-error@-2 {{missing}}
15+
16+
_Static_assert(__is_same(atomic_char8_t, unsigned char _Atomic), "");
17+
// c17-error@-1 {{use of undeclared identifier 'atomic_char8_t'}}
18+
// c17-error@-2 {{unknown type name 'atomic_char8_t'}}
19+
20+
_Static_assert(_Generic(u8"", unsigned char*: 1, char*: 0), "");
21+
// c17-error@-1 {{static assertion failed}}
22+
23+
// -fsigned-char is the default
24+
#define M(X) __enable_constant_folding((X) >= 0x80)
25+
26+
_Static_assert(M(u8"\U000000E9"[0]), "");
27+
// c17-error@-1 {{static assertion failed}}
28+
#if __STDC_VERSION__ >= 202311L
29+
_Static_assert(M(u8'\xC3'), "");
30+
#endif
31+
32+
const char cu8[] = u8"text";
33+
const signed char scu8[] = u8"text";
34+
const unsigned char ucu8[] = u8"text";

clang/www/c_status.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1066,7 +1066,7 @@ <h2 id="c2x">C23 implementation status</h2>
10661066
<tr>
10671067
<td>char8_t: A type for UTF-8 characters and strings</td>
10681068
<td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2653.htm">N2653</a></td>
1069-
<td class="none" align="center">No</td>
1069+
<td class="unreleased" align="center">Clang 19</td>
10701070
</tr>
10711071
<tr>
10721072
<td>Clarification for max exponent macros-update</td>

0 commit comments

Comments
 (0)