cplusplus · AlisdairM · Oct 28, 2024
diff --git a/source/lex.tex b/source/lex.tex
@@ -110,9 +110,9 @@
 \indextext{line splicing}%
 If the first translation character is \unicode{feff}{byte order mark},
 it is deleted.
-Each sequence of a backslash character (\textbackslash)
+Each sequence of a \unicode{005c}{reverse solidus} character (\tcode{\textbackslash})
 immediately followed by
-zero or more whitespace characters other than new-line followed by
+zero or more \grammarterm{whitespace-character}s followed by
 a new-line character is deleted, splicing
 physical source lines to form \defnx{logical source lines}{source line!logical}. Only the last
 backslash on any physical source line shall be eligible for being part
@@ -126,9 +126,13 @@
 shall be processed as if an additional new-line character were appended
 to the file.
 
-\item The source file is decomposed into preprocessing
-tokens\iref{lex.pptoken} and sequences of whitespace characters
-(including comments). A source file shall not end in a partial
+\item
+\indextext{whitespace}%
+\indextext{comment}%
+\indextext{token!preprocessing}%
+The source file is decomposed into preprocessing tokens\iref{lex.pptoken} and
+whitespace\iref{lex.whitespace}.
+A source file shall not end in a partial
 preprocessing token or in a partial comment.
 \begin{footnote}
 A partial preprocessing
@@ -140,10 +144,10 @@
 would arise from a source file ending with an unclosed \tcode{/*}
 comment.
 \end{footnote}
-Each comment\iref{lex.comment} is replaced by one space character. New-line characters are
-retained. Whether each nonempty sequence of whitespace characters other
-than new-line is retained or replaced by one space character is
-unspecified.
+Each comment\iref{lex.comment} is replaced by one \unicode{0020}{space} character.
+New-line characters are retained.
+Whether each nonempty sequence of \grammarterm{whitespace-character}s is
+retained or replaced by one \unicode{0020}{space} character is unspecified.
 As characters from the source file are consumed
 to form the next preprocessing token
 (i.e., not being consumed as part of a comment or other forms of whitespace),
@@ -181,7 +185,7 @@
 
 \item
 Each preprocessing token is converted into a token\iref{lex.token}.
-Whitespace characters separating tokens are no longer significant.
+Whitespace separating tokens is no longer significant.
 The resulting tokens constitute a \defn{translation unit} and
 are syntactically and
 semantically analyzed as a \grammarterm{translation-unit}\iref{basic.link} and
@@ -467,7 +471,34 @@
 None of these names or aliases have leading or trailing spaces.
 \end{note}
 
-\rSec1[lex.comment]{Comments}
+\rSec1[lex.whitespace]{Whitespace}
+\indextext{whitespace|(}%
+
+\rSec2[lex.whitespace.general]{General}
+
+\indextext{character!whitespace}%
+\begin{bnf}
+\nontermdef{whitespace-character}\br
+    \unicode{0009}{character tabulation}\br
+    \unicode{000b}{line tabulation}\br
+    \unicode{000c}{form feed}\br
+    \unicode{0020}{space}\br
+\end{bnf}
+
+\pnum
+Sequences of \grammarterm{whitespace-character}s, new-line characters, and
+comments\iref{lex.comment} form \defn{whitespace}, which carries no
+semantic significance other than to separate tokens\iref{lex.token}
+and preprocessing tokens\iref{lex.pptoken}.
+
+\pnum
+\begin{note}
+Implementations are permitted but not required to coalesce nonempty
+sequences of whitespace into a single \unicode{0020}{space}
+while retaining new-lines\iref{lex.phases}.
+\end{note}
+
+\rSec2[lex.comment]{Comments}
 
 \pnum
 \indextext{comment|(}%
@@ -477,8 +508,8 @@
 characters \tcode{*/}. These comments do not nest.
 \indextext{comment!\tcode{//}}%
 The characters \tcode{//} start a comment, which terminates immediately before the
-next new-line character. If there is a form-feed or a vertical-tab
-character in such a comment, only whitespace characters shall appear
+next new-line character. If there is a \unicode{000b}{line tabulation} or a \unicode{000c}{form feed}
+character in such a comment, only \grammarterm{whitespace-character}s shall appear
 between it and the new-line that terminates the comment; no diagnostic
 is required.
 \begin{note}
@@ -488,7 +519,14 @@
 characters \tcode{//} and \tcode{/*} have no special meaning within a
 \tcode{/*} comment.
 \end{note}
+
+\pnum
+\begin{note}
+Each comment is replaced by one \unicode{0020}{space} character in translation phase 3\iref{lex.phases}
+as part of decomposing a source file into preprocessing tokens and whitespace.
+\end{note}
 \indextext{comment|)}
+\indextext{whitespace|)}%
 
 \rSec1[lex.pptoken]{Preprocessing tokens}
 
@@ -506,7 +544,7 @@
     string-literal\br
     user-defined-string-literal\br
     preprocessing-op-or-punc\br
-    \textnormal{each non-whitespace character that cannot be one of the above}
+    \textnormal{each non-\grammarterm{whitespace-character} that cannot be one of the above}
 \end{bnf}
 
 \pnum
@@ -520,22 +558,15 @@
 (\grammarterm{import-keyword}, \grammarterm{module-keyword}, and \grammarterm{export-keyword}),
 identifiers, preprocessing numbers, character literals (including user-defined character
 literals), string literals (including user-defined string literals), preprocessing
-operators and punctuators, and single non-whitespace characters that do not lexically
+operators and punctuators,
+and single non-\grammarterm{whitespace-character}s that do not lexically
 match the other preprocessing token categories.
 If a \unicode{0027}{apostrophe} or a \unicode{0022}{quotation mark} character
 matches the last category, the program is ill-formed.
 If any character not in the basic character set matches the last category,
 the program is ill-formed.
-Preprocessing tokens can be separated by
 \indextext{whitespace}%
-whitespace;
-\indextext{comment}%
-this consists of comments\iref{lex.comment}, or whitespace characters
-(\unicode{0020}{space},
-\unicode{0009}{character tabulation},
-new-line,
-\unicode{000b}{line tabulation}, and
-\unicode{000c}{form feed}), or both.
+Preprocessing tokens can be separated by whitespace\iref{lex.whitespace}.
 As described in \ref{cpp}, in certain
 circumstances during translation phase 4, whitespace (or the absence
 thereof) serves as more than preprocessing token separation. Whitespace
@@ -824,9 +855,7 @@
 \end{footnote}
 operators, and other separators.
 \indextext{whitespace}%
-Blanks, horizontal and vertical tabs, newlines, formfeeds, and comments
-(collectively, ``whitespace''), as described below, are ignored except
-as they serve to separate tokens.
+Whitespace\iref{lex.whitespace} is ignored except as it serves to separate tokens.
 \begin{note}
 Whitespace can separate otherwise adjacent identifiers, keywords, numeric
 literals, and alternative tokens containing alphabetic characters.
@@ -1786,8 +1815,8 @@
 \begin{bnf}
 \nontermdef{d-char}\br
     \textnormal{any member of the basic character set except:}\br
-    \bnfindent\textnormal{\unicode{0020}{space}, \unicode{0028}{left parenthesis}, \unicode{0029}{right parenthesis}, \unicode{005c}{reverse solidus},}\br
-    \bnfindent\textnormal{\unicode{0009}{character tabulation}, \unicode{000b}{line tabulation}, \unicode{000c}{form feed}, and new-line}
+    \bnfindent\textnormal{a \grammarterm{whitespace-character}, \unicode{0028}{left parenthesis}, \unicode{0029}{right parenthesis},}\br
+    \bnfindent\textnormal{\unicode{005c}{reverse solidus}, and new-line}
 \end{bnf}
 
 \pnum

diff --git a/source/preprocessor.tex b/source/preprocessor.tex
@@ -289,12 +289,12 @@
 the directive name and the following new-line character.
 
 \pnum
-The only whitespace characters that shall appear
+The only \grammarterm{whitespace-character}s that shall appear
 between preprocessing tokens
 within a preprocessing directive
 (from just after the directive-introducing token
 through just before the terminating new-line character)
-are space and horizontal-tab
+are \unicode{0020}{space} and \unicode{0009}{character tabulation}
 (including spaces that have replaced comments
 or possibly other whitespace characters
 in translation phase 3).
@@ -1496,7 +1496,7 @@
 \indextext{name!macro|see{macro, name}}%
 \defnx{macro name}{macro!name}.
 There is one name space for macro names.
-Any whitespace characters preceding or following the
+Any \grammarterm{whitespace-character}s preceding or following the
 replacement list of preprocessing tokens are not considered
 part of the replacement list for either form of macro.
 
@@ -1573,7 +1573,7 @@
 right parenthesis preprocessing tokens.
 Within the sequence of preprocessing tokens making up an invocation
 of a function-like macro,
-new-line is considered a normal whitespace character.
+new-line is considered a \grammarterm{whitespace-character}.
 
 \pnum
 \indextext{macro!function-like!arguments}%