From d9bd48277058ef602f6a0df069450380e5ab7603 Mon Sep 17 00:00:00 2001 From: Alisdair Meredith Date: Mon, 28 Oct 2024 09:22:26 -0400 Subject: [PATCH] [lex] Better specify whitespace characters This commit defines a grammar term for _whitespace-character_ and uses it consistently where the plain text term whitespace character is used. A whitespace character is defined as one of the five characters that are mentioned in the text closest to providing a definition. The unicode character name is (mostly) consistently used to name these characters, and for consistency, similar changes were made to name unicode characters rather than render specified characters in code font throughout [lex]. The one exception is backslash, which is retained as-is to avoid making more issues for P2348. Note that this commit is not a replacement for P2348, merely a clearer statement of the existing specification without any normative changes. --- source/lex.tex | 87 +++++++++++++++++++++++++++-------------- source/preprocessor.tex | 8 ++-- 2 files changed, 62 insertions(+), 33 deletions(-) diff --git a/source/lex.tex b/source/lex.tex index 6f5bffac97..753f8fb625 100644 --- a/source/lex.tex +++ b/source/lex.tex @@ -110,9 +110,9 @@ \indextext{line splicing}% If the first translation character is \unicode{feff}{byte order mark}, it is deleted. -Each sequence of a backslash character (\textbackslash) +Each sequence of a \unicode{005c}{reverse solidus} character (\tcode{\textbackslash}) immediately followed by -zero or more whitespace characters other than new-line followed by +zero or more \grammarterm{whitespace-character}s followed by a new-line character is deleted, splicing physical source lines to form \defnx{logical source lines}{source line!logical}. Only the last backslash on any physical source line shall be eligible for being part @@ -126,9 +126,13 @@ shall be processed as if an additional new-line character were appended to the file. 
-\item The source file is decomposed into preprocessing -tokens\iref{lex.pptoken} and sequences of whitespace characters -(including comments). A source file shall not end in a partial +\item +\indextext{whitespace}% +\indextext{comment}% +\indextext{token!preprocessing}% +The source file is decomposed into preprocessing tokens\iref{lex.pptoken} and +whitespace\iref{lex.whitespace}. +A source file shall not end in a partial preprocessing token or in a partial comment. \begin{footnote} A partial preprocessing @@ -140,10 +144,10 @@ would arise from a source file ending with an unclosed \tcode{/*} comment. \end{footnote} -Each comment\iref{lex.comment} is replaced by one space character. New-line characters are -retained. Whether each nonempty sequence of whitespace characters other -than new-line is retained or replaced by one space character is -unspecified. +Each comment\iref{lex.comment} is replaced by one \unicode{0020}{space} character. +New-line characters are retained. +Whether each nonempty sequence of \grammarterm{whitespace-character}s is +retained or replaced by one \unicode{0020}{space} character is unspecified. As characters from the source file are consumed to form the next preprocessing token (i.e., not being consumed as part of a comment or other forms of whitespace), @@ -181,7 +185,7 @@ \item Each preprocessing token is converted into a token\iref{lex.token}. -Whitespace characters separating tokens are no longer significant. +Whitespace separating tokens is no longer significant. The resulting tokens constitute a \defn{translation unit} and are syntactically and semantically analyzed as a \grammarterm{translation-unit}\iref{basic.link} and @@ -467,7 +471,34 @@ None of these names or aliases have leading or trailing spaces. 
\end{note} -\rSec1[lex.comment]{Comments} +\rSec1[lex.whitespace]{Whitespace} +\indextext{whitespace|(}% + +\rSec2[lex.whitespace.general]{General} + +\indextext{character!whitespace|(}% +\begin{bnf} +\nontermdef{whitespace-character}\br + \unicode{0009}{character tabulation}\br + \unicode{000b}{line tabulation}\br + \unicode{000c}{form feed}\br + \unicode{0020}{space} +\end{bnf} + +\pnum +Sequences of \grammarterm{whitespace-character}s, new-line characters, and +comments\iref{lex.comment} form \defn{whitespace}, which carries no +semantic significance other than to separate tokens\iref{lex.token} +and preprocessing tokens\iref{lex.pptoken}. + +\pnum +\begin{note} +Implementations are permitted but not required to coalesce nonempty +sequences of whitespace into a single \unicode{0020}{space} +while retaining new-lines\iref{lex.phases}. +\end{note} + +\rSec2[lex.comment]{Comments} \pnum \indextext{comment|(}% @@ -477,8 +508,8 @@ characters \tcode{*/}. These comments do not nest. \indextext{comment!\tcode{//}}% The characters \tcode{//} start a comment, which terminates immediately before the -next new-line character. If there is a form-feed or a vertical-tab -character in such a comment, only whitespace characters shall appear +next new-line character. If there is a \unicode{000b}{line tabulation} or a \unicode{000c}{form feed} +character in such a comment, only \grammarterm{whitespace-character}s shall appear between it and the new-line that terminates the comment; no diagnostic is required. \begin{note} @@ -488,7 +519,14 @@ characters \tcode{//} and \tcode{/*} have no special meaning within a \tcode{/*} comment. \end{note} + +\pnum +\begin{note} +Comments are turned into \unicode{0020}{space} characters in phase 3 of translation +as part of decomposing a source file into preprocessing tokens and whitespace. 
+\end{note} \indextext{comment|)} +\indextext{whitespace|)}% \rSec1[lex.pptoken]{Preprocessing tokens} @@ -506,7 +544,7 @@ string-literal\br user-defined-string-literal\br preprocessing-op-or-punc\br - \textnormal{each non-whitespace character that cannot be one of the above} + \textnormal{each non-\grammarterm{whitespace-character} that cannot be one of the above} \end{bnf} \pnum @@ -520,22 +558,15 @@ (\grammarterm{import-keyword}, \grammarterm{module-keyword}, and \grammarterm{export-keyword}), identifiers, preprocessing numbers, character literals (including user-defined character literals), string literals (including user-defined string literals), preprocessing -operators and punctuators, and single non-whitespace characters that do not lexically +operators and punctuators, +and single non-\grammarterm{whitespace-character}s that do not lexically match the other preprocessing token categories. If a \unicode{0027}{apostrophe} or a \unicode{0022}{quotation mark} character matches the last category, the program is ill-formed. If any character not in the basic character set matches the last category, the program is ill-formed. -Preprocessing tokens can be separated by \indextext{whitespace}% -whitespace; -\indextext{comment}% -this consists of comments\iref{lex.comment}, or whitespace characters -(\unicode{0020}{space}, -\unicode{0009}{character tabulation}, -new-line, -\unicode{000b}{line tabulation}, and -\unicode{000c}{form feed}), or both. +Preprocessing tokens can be separated by whitespace\iref{lex.whitespace}. As described in \ref{cpp}, in certain circumstances during translation phase 4, whitespace (or the absence thereof) serves as more than preprocessing token separation. Whitespace @@ -824,9 +855,7 @@ \end{footnote} operators, and other separators. \indextext{whitespace}% -Blanks, horizontal and vertical tabs, newlines, formfeeds, and comments -(collectively, ``whitespace''), as described below, are ignored except -as they serve to separate tokens. 
+Whitespace\iref{lex.whitespace} is ignored except to separate tokens. \begin{note} Whitespace can separate otherwise adjacent identifiers, keywords, numeric literals, and alternative tokens containing alphabetic characters. @@ -1786,8 +1815,8 @@ \begin{bnf} \nontermdef{d-char}\br \textnormal{any member of the basic character set except:}\br - \bnfindent\textnormal{\unicode{0020}{space}, \unicode{0028}{left parenthesis}, \unicode{0029}{right parenthesis}, \unicode{005c}{reverse solidus},}\br - \bnfindent\textnormal{\unicode{0009}{character tabulation}, \unicode{000b}{line tabulation}, \unicode{000c}{form feed}, and new-line} + \bnfindent\textnormal{a \grammarterm{whitespace-character}, \unicode{0028}{left parenthesis}, \unicode{0029}{right parenthesis},}\br + \bnfindent\textnormal{\unicode{005c}{reverse solidus}, and new-line} \end{bnf} \pnum diff --git a/source/preprocessor.tex b/source/preprocessor.tex index 64a6a72ccc..2ecdee4f19 100644 --- a/source/preprocessor.tex +++ b/source/preprocessor.tex @@ -289,12 +289,12 @@ the directive name and the following new-line character. \pnum -The only whitespace characters that shall appear +The only \grammarterm{whitespace-character}s that shall appear between preprocessing tokens within a preprocessing directive (from just after the directive-introducing token through just before the terminating new-line character) -are space and horizontal-tab +are \unicode{0020}{space} and \unicode{0009}{character tabulation} (including spaces that have replaced comments or possibly other whitespace characters in translation phase 3). @@ -1496,7 +1496,7 @@ \indextext{name!macro|see{macro, name}}% \defnx{macro name}{macro!name}. There is one name space for macro names. -Any whitespace characters preceding or following the +Any \grammarterm{whitespace-character}s preceding or following the replacement list of preprocessing tokens are not considered part of the replacement list for either form of macro. 
@@ -1573,7 +1573,7 @@ right parenthesis preprocessing tokens. Within the sequence of preprocessing tokens making up an invocation of a function-like macro, -new-line is considered a normal whitespace character. +new-line is considered a \grammarterm{whitespace-character}. \pnum \indextext{macro!function-like!arguments}%