Skip to content

[Clang] [C23] Implement N2653: u8 strings are char8_t[] #97208

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Jul 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,12 @@ C23 Feature Support
- Added the ``FLT_NORM_MAX``, ``DBL_NORM_MAX``, and ``LDBL_NORM_MAX`` to the
freestanding implementation of ``<float.h>`` that ships with Clang.

- Compiler support for `N2653 char8_t: A type for UTF-8 characters and strings
<https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2653.htm>`_: ``u8`` string
literals are now of type ``char8_t[N]`` in C23, and Clang now exposes
``__CLANG_ATOMIC_CHAR8_T_LOCK_FREE``/``__GCC_ATOMIC_CHAR8_T_LOCK_FREE`` to
implement the corresponding macro in ``<stdatomic.h>``.

Non-comprehensive list of changes in this release
-------------------------------------------------

Expand Down
5 changes: 4 additions & 1 deletion clang/include/clang/Basic/DiagnosticSemaKinds.td
Original file line number Diff line number Diff line change
Expand Up @@ -7249,7 +7249,10 @@ def err_array_init_utf8_string_into_char : Error<
def warn_cxx20_compat_utf8_string : Warning<
"type of UTF-8 string literal will change from array of const char to "
"array of const char8_t in C++20">, InGroup<CXX20Compat>, DefaultIgnore;
def note_cxx20_compat_utf8_string_remove_u8 : Note<
def warn_c23_compat_utf8_string : Warning<
"type of UTF-8 string literal will change from array of char to "
"array of char8_t in C23">, InGroup<C23Compat>, DefaultIgnore;
def note_cxx20_c23_compat_utf8_string_remove_u8 : Note<
"remove 'u8' prefix to avoid a change of behavior; "
"Clang encodes unprefixed narrow string literals as UTF-8">;
def err_array_init_different_type : Error<
Expand Down
8 changes: 6 additions & 2 deletions clang/lib/Frontend/InitPreprocessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1170,6 +1170,8 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
DefineType("__WCHAR_TYPE__", TI.getWCharType(), Builder);
DefineType("__WINT_TYPE__", TI.getWIntType(), Builder);
DefineTypeSizeAndWidth("__SIG_ATOMIC", TI.getSigAtomicType(), TI, Builder);
if (LangOpts.C23)
DefineType("__CHAR8_TYPE__", TI.UnsignedChar, Builder);
DefineType("__CHAR16_TYPE__", TI.getChar16Type(), Builder);
DefineType("__CHAR32_TYPE__", TI.getChar32Type(), Builder);

Expand Down Expand Up @@ -1349,8 +1351,10 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
getLockFreeValue(TI.get##Type##Width(), TI));
DEFINE_LOCK_FREE_MACRO(BOOL, Bool);
DEFINE_LOCK_FREE_MACRO(CHAR, Char);
if (LangOpts.Char8)
DEFINE_LOCK_FREE_MACRO(CHAR8_T, Char); // Treat char8_t like char.
// char8_t has the same representation / width as unsigned
// char in C++ and is a typedef for unsigned char in C23
if (LangOpts.Char8 || LangOpts.C23)
DEFINE_LOCK_FREE_MACRO(CHAR8_T, Char);
DEFINE_LOCK_FREE_MACRO(CHAR16_T, Char16);
DEFINE_LOCK_FREE_MACRO(CHAR32_T, Char32);
DEFINE_LOCK_FREE_MACRO(WCHAR_T, WChar);
Expand Down
6 changes: 6 additions & 0 deletions clang/lib/Headers/stdatomic.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ extern "C" {

#define ATOMIC_BOOL_LOCK_FREE __CLANG_ATOMIC_BOOL_LOCK_FREE
#define ATOMIC_CHAR_LOCK_FREE __CLANG_ATOMIC_CHAR_LOCK_FREE
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
#define ATOMIC_CHAR8_T_LOCK_FREE __CLANG_ATOMIC_CHAR8_T_LOCK_FREE
#endif
#define ATOMIC_CHAR16_T_LOCK_FREE __CLANG_ATOMIC_CHAR16_T_LOCK_FREE
#define ATOMIC_CHAR32_T_LOCK_FREE __CLANG_ATOMIC_CHAR32_T_LOCK_FREE
#define ATOMIC_WCHAR_T_LOCK_FREE __CLANG_ATOMIC_WCHAR_T_LOCK_FREE
Expand Down Expand Up @@ -104,6 +107,9 @@ typedef _Atomic(long) atomic_long;
typedef _Atomic(unsigned long) atomic_ulong;
typedef _Atomic(long long) atomic_llong;
typedef _Atomic(unsigned long long) atomic_ullong;
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
typedef _Atomic(unsigned char) atomic_char8_t;
#endif
typedef _Atomic(uint_least16_t) atomic_char16_t;
typedef _Atomic(uint_least32_t) atomic_char32_t;
typedef _Atomic(wchar_t) atomic_wchar_t;
Expand Down
22 changes: 15 additions & 7 deletions clang/lib/Sema/SemaExpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2051,6 +2051,8 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) {
} else if (Literal.isUTF8()) {
if (getLangOpts().Char8)
CharTy = Context.Char8Ty;
else if (getLangOpts().C23)
CharTy = Context.UnsignedCharTy;
Kind = StringLiteralKind::UTF8;
} else if (Literal.isUTF16()) {
CharTy = Context.Char16Ty;
Expand All @@ -2062,17 +2064,23 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) {
CharTy = Context.UnsignedCharTy;
}

// Warn on initializing an array of char from a u8 string literal; this
// becomes ill-formed in C++2a.
if (getLangOpts().CPlusPlus && !getLangOpts().CPlusPlus20 &&
!getLangOpts().Char8 && Kind == StringLiteralKind::UTF8) {
Diag(StringTokLocs.front(), diag::warn_cxx20_compat_utf8_string);
// Warn on u8 string literals before C++20 and C23, whose type
// was an array of char before but becomes an array of char8_t.
// In C++20, it cannot be used where a pointer to char is expected.
// In C23, it might have an unexpected value if char was signed.
if (Kind == StringLiteralKind::UTF8 &&
(getLangOpts().CPlusPlus
? !getLangOpts().CPlusPlus20 && !getLangOpts().Char8
: !getLangOpts().C23)) {
Diag(StringTokLocs.front(), getLangOpts().CPlusPlus
? diag::warn_cxx20_compat_utf8_string
: diag::warn_c23_compat_utf8_string);

// Create removals for all 'u8' prefixes in the string literal(s). This
// ensures C++2a compatibility (but may change the program behavior when
// ensures C++20/C23 compatibility (but may change the program behavior when
// built by non-Clang compilers for which the execution character set is
// not always UTF-8).
auto RemovalDiag = PDiag(diag::note_cxx20_compat_utf8_string_remove_u8);
auto RemovalDiag = PDiag(diag::note_cxx20_c23_compat_utf8_string_remove_u8);
SourceLocation RemovalDiagLoc;
for (const Token &Tok : StringToks) {
if (Tok.getKind() == tok::utf8_string_literal) {
Expand Down
34 changes: 34 additions & 0 deletions clang/test/C/C23/n2653.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// RUN: %clang_cc1 -ffreestanding -verify=c23 -std=c23 %s
// RUN: %clang_cc1 -ffreestanding -verify=c17 -std=c17 %s

// Test for WG14 N2653 (char8_t: a type for UTF-8 characters and strings).
// The file is compiled twice: once as C23, where every check below must
// pass silently, and once as C17, where the checks that rely on the new
// behaviour are expected to fail with the diagnostics annotated after them.
// NOTE: the annotations use relative line offsets, so do not insert lines
// between a checked statement and the annotation that follows it.

// c23-no-diagnostics

#include <stdatomic.h>

// Wraps an expression in a __builtin_constant_p conditional; presumably this
// lets Clang constant-fold operands (such as a subscripted string literal)
// that are not strict integer constant expressions -- confirm if changing.
#define __enable_constant_folding(x) (__builtin_constant_p(x) ? (x) : (x))
// Evaluates to 1 only when _Generic matches a against b AND b against a,
// i.e. the two types are compatible in both directions.
#define __is_same(a, b) (__extension__ _Generic(a, b: 1, default: 0) && __extension__ _Generic(b, a: 1, default: 0))

// The lock-free macro for char8_t must be provided by <stdatomic.h> in C23
// and must be absent in C17 (where the #error below is the expected result).
#ifndef ATOMIC_CHAR8_T_LOCK_FREE
#error missing
#endif
// c17-error@-2 {{missing}}

// atomic_char8_t must exist in C23 and be an atomic unsigned char; in C17 the
// name is not declared, which produces one error per _Generic use in the macro.
_Static_assert(__is_same(atomic_char8_t, unsigned char _Atomic), "");
// c17-error@-1 {{use of undeclared identifier 'atomic_char8_t'}}
// c17-error@-2 {{unknown type name 'atomic_char8_t'}}

// A u8 string literal decays to unsigned char* in C23; in C17 it decays to
// char*, which selects the 0 branch and makes the assertion fail.
_Static_assert(_Generic(u8"", unsigned char*: 1, char*: 0), "");
// c17-error@-1 {{static assertion failed}}

// -fsigned-char is the default
// M(X) is true when X, after the usual promotions, is at least 0x80. With a
// signed plain char a UTF-8 lead byte is negative, so M is false in C17.
#define M(X) __enable_constant_folding((X) >= 0x80)

// U+00E9 encodes in UTF-8 with lead byte 0xC3: non-negative as an unsigned
// char element (C23), negative as a signed char element (C17).
_Static_assert(M(u8"\U000000E9"[0]), "");
// c17-error@-1 {{static assertion failed}}
// The u8 character-constant check is only compiled in C23 mode.
#if __STDC_VERSION__ >= 202311L
_Static_assert(M(u8'\xC3'), "");
#endif

// In both language modes a u8 string literal must still be accepted as an
// initializer for arrays of every character type (no diagnostics expected).
const char cu8[] = u8"text";
const signed char scu8[] = u8"text";
const unsigned char ucu8[] = u8"text";
2 changes: 1 addition & 1 deletion clang/www/c_status.html
Original file line number Diff line number Diff line change
Expand Up @@ -1066,7 +1066,7 @@ <h2 id="c2x">C23 implementation status</h2>
<tr>
<td>char8_t: A type for UTF-8 characters and strings</td>
<td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2653.htm">N2653</a></td>
<td class="none" align="center">No</td>
<td class="unreleased" align="center">Clang 19</td>
</tr>
<tr>
<td>Clarification for max exponent macros-update</td>
Expand Down
Loading