diff --git a/source/lex.tex b/source/lex.tex index 6f5bffac97..753f8fb625 100644 --- a/source/lex.tex +++ b/source/lex.tex @@ -110,9 +110,9 @@ \indextext{line splicing}% If the first translation character is \unicode{feff}{byte order mark}, it is deleted. -Each sequence of a backslash character (\textbackslash) +Each sequence of a \unicode{005c}{reverse solidus} character (\tcode{\textbackslash}) immediately followed by -zero or more whitespace characters other than new-line followed by +zero or more \grammarterm{whitespace-character}s followed by a new-line character is deleted, splicing physical source lines to form \defnx{logical source lines}{source line!logical}. Only the last backslash on any physical source line shall be eligible for being part @@ -126,9 +126,13 @@ shall be processed as if an additional new-line character were appended to the file. -\item The source file is decomposed into preprocessing -tokens\iref{lex.pptoken} and sequences of whitespace characters -(including comments). A source file shall not end in a partial +\item +\indextext{whitespace}% +\indextext{comment}% +\indextext{token!preprocessing}% +The source file is decomposed into preprocessing tokens\iref{lex.pptoken} and +whitespace\iref{lex.whitespace}. +A source file shall not end in a partial preprocessing token or in a partial comment. \begin{footnote} A partial preprocessing @@ -140,10 +144,10 @@ would arise from a source file ending with an unclosed \tcode{/*} comment. \end{footnote} -Each comment\iref{lex.comment} is replaced by one space character. New-line characters are -retained. Whether each nonempty sequence of whitespace characters other -than new-line is retained or replaced by one space character is -unspecified. +Each comment\iref{lex.comment} is replaced by one \unicode{0020}{space} character. +New-line characters are retained. 
+Whether each nonempty sequence of \grammarterm{whitespace-character}s is +retained or replaced by one \unicode{0020}{space} character is unspecified. As characters from the source file are consumed to form the next preprocessing token (i.e., not being consumed as part of a comment or other forms of whitespace), @@ -181,7 +185,7 @@ \item Each preprocessing token is converted into a token\iref{lex.token}. -Whitespace characters separating tokens are no longer significant. +Whitespace separating tokens is no longer significant. The resulting tokens constitute a \defn{translation unit} and are syntactically and semantically analyzed as a \grammarterm{translation-unit}\iref{basic.link} and @@ -467,7 +471,34 @@ None of these names or aliases have leading or trailing spaces. \end{note} -\rSec1[lex.comment]{Comments} +\rSec1[lex.whitespace]{Whitespace} +\indextext{whitespace|(}% + +\rSec2[lex.whitespace.general]{General} + +\indextext{character!whitespace|(}% +\begin{bnf} +\nontermdef{whitespace-character}\br + \unicode{0009}{character tabulation}\br + \unicode{000b}{line tabulation}\br + \unicode{000c}{form feed}\br + \unicode{0020}{space} +\end{bnf} + +\pnum +Sequences of \grammarterm{whitespace-character}s, new-line characters, and +comments\iref{lex.comment} form \defn{whitespace}, which carries no +semantic significance other than to separate tokens\iref{lex.token} +and preprocessing tokens\iref{lex.pptoken}. + +\pnum +\begin{note} +Implementations are permitted but not required to coalesce non-empty +sequences of whitespace into a single \unicode{0020}{space} +while retaining new-lines\iref{lex.phases}. +\end{note} + +\rSec2[lex.comment]{Comments} \pnum \indextext{comment|(}% @@ -477,8 +508,8 @@ characters \tcode{*/}. These comments do not nest. \indextext{comment!\tcode{//}}% The characters \tcode{//} start a comment, which terminates immediately before the -next new-line character. 
If there is a form-feed or a vertical-tab -character in such a comment, only whitespace characters shall appear +next new-line character. If there is a \unicode{000b}{line tabulation} or a \unicode{000c}{form feed} +character in such a comment, only \grammarterm{whitespace-character}s shall appear between it and the new-line that terminates the comment; no diagnostic is required. \begin{note} @@ -488,7 +519,14 @@ characters \tcode{//} and \tcode{/*} have no special meaning within a \tcode{/*} comment. \end{note} + +\pnum +\begin{note} +Comments are turned into \unicode{0020}{space} characters in phase 3 of translation +as part of decomposing a source file into preprocessing tokens and whitespace. +\end{note} \indextext{comment|)} +\indextext{whitespace|)}% \rSec1[lex.pptoken]{Preprocessing tokens} @@ -506,7 +544,7 @@ string-literal\br user-defined-string-literal\br preprocessing-op-or-punc\br - \textnormal{each non-whitespace character that cannot be one of the above} + \textnormal{each non-\grammarterm{whitespace-character} that cannot be one of the above} \end{bnf} \pnum @@ -520,22 +558,15 @@ (\grammarterm{import-keyword}, \grammarterm{module-keyword}, and \grammarterm{export-keyword}), identifiers, preprocessing numbers, character literals (including user-defined character literals), string literals (including user-defined string literals), preprocessing -operators and punctuators, and single non-whitespace characters that do not lexically +operators and punctuators, +and single non-\grammarterm{whitespace-character}s that do not lexically match the other preprocessing token categories. If a \unicode{0027}{apostrophe} or a \unicode{0022}{quotation mark} character matches the last category, the program is ill-formed. If any character not in the basic character set matches the last category, the program is ill-formed. 
-Preprocessing tokens can be separated by \indextext{whitespace}% -whitespace; -\indextext{comment}% -this consists of comments\iref{lex.comment}, or whitespace characters -(\unicode{0020}{space}, -\unicode{0009}{character tabulation}, -new-line, -\unicode{000b}{line tabulation}, and -\unicode{000c}{form feed}), or both. +Preprocessing tokens can be separated by whitespace\iref{lex.whitespace}. As described in \ref{cpp}, in certain circumstances during translation phase 4, whitespace (or the absence thereof) serves as more than preprocessing token separation. Whitespace @@ -824,9 +855,7 @@ \end{footnote} operators, and other separators. \indextext{whitespace}% -Blanks, horizontal and vertical tabs, newlines, formfeeds, and comments -(collectively, ``whitespace''), as described below, are ignored except -as they serve to separate tokens. +Whitespace\iref{lex.whitespace} is ignored except to separate tokens. \begin{note} Whitespace can separate otherwise adjacent identifiers, keywords, numeric literals, and alternative tokens containing alphabetic characters. @@ -1786,8 +1815,8 @@ \begin{bnf} \nontermdef{d-char}\br \textnormal{any member of the basic character set except:}\br - \bnfindent\textnormal{\unicode{0020}{space}, \unicode{0028}{left parenthesis}, \unicode{0029}{right parenthesis}, \unicode{005c}{reverse solidus},}\br - \bnfindent\textnormal{\unicode{0009}{character tabulation}, \unicode{000b}{line tabulation}, \unicode{000c}{form feed}, and new-line} + \bnfindent\textnormal{a \grammarterm{whitespace-character}, \unicode{0028}{left parenthesis}, \unicode{0029}{right parenthesis},}\br + \bnfindent\textnormal{\unicode{005c}{reverse solidus}, and new-line} \end{bnf} \pnum diff --git a/source/preprocessor.tex b/source/preprocessor.tex index 64a6a72ccc..2ecdee4f19 100644 --- a/source/preprocessor.tex +++ b/source/preprocessor.tex @@ -289,12 +289,12 @@ the directive name and the following new-line character. 
\pnum -The only whitespace characters that shall appear +The only \grammarterm{whitespace-character}s that shall appear between preprocessing tokens within a preprocessing directive (from just after the directive-introducing token through just before the terminating new-line character) -are space and horizontal-tab +are \unicode{0020}{space} and \unicode{0009}{character tabulation} (including spaces that have replaced comments or possibly other whitespace characters in translation phase 3). @@ -1496,7 +1496,7 @@ \indextext{name!macro|see{macro, name}}% \defnx{macro name}{macro!name}. There is one name space for macro names. -Any whitespace characters preceding or following the +Any \grammarterm{whitespace-character}s preceding or following the replacement list of preprocessing tokens are not considered part of the replacement list for either form of macro. @@ -1573,7 +1573,7 @@ right parenthesis preprocessing tokens. Within the sequence of preprocessing tokens making up an invocation of a function-like macro, -new-line is considered a normal whitespace character. +new-line is considered a \grammarterm{whitespace-character}. \pnum \indextext{macro!function-like!arguments}%