From dffd88ee51447d45b373a64f4434ab140e433c2a Mon Sep 17 00:00:00 2001
From: Rintaro Ishizaki <rishizaki@apple.com>
Date: Tue, 9 Dec 2025 12:17:27 -0800
Subject: [PATCH] [Parser] Eliminate 'CommentRetentionMode::None' in Lexer

Lexer should always set `Token.CommentLength` correctly because it
necessary for restoring to the token position with comments.
Otherwise, 'Token::isAtStartOfLine()' might not correctly set.
---
 include/swift/Parse/Lexer.h              |  3 +-
 include/swift/Parse/Parser.h             |  4 +++
 lib/IDE/Formatting.cpp                   | 10 +++----
 lib/Parse/Lexer.cpp                      |  4 +--
 lib/Parse/ParseDecl.cpp                  |  2 +-
 lib/Parse/ParseGeneric.cpp               |  2 +-
 lib/Parse/Parser.cpp                     | 34 +++++++++-------------
 lib/Refactoring/Async/AsyncConverter.cpp |  4 +--
 lib/Sema/MiscDiagnostics.cpp             |  9 +++---
 test/Parse/backgtrack-startofline.swift  |  7 +++++
 unittests/Parse/LexerTests.cpp           | 36 +++---------------------
 11 files changed, 45 insertions(+), 70 deletions(-)
 create mode 100644 test/Parse/backgtrack-startofline.swift
diff --git a/include/swift/Parse/Lexer.h b/include/swift/Parse/Lexer.h
index 5390a059892d7..d845ba0d1932d 100644
--- a/include/swift/Parse/Lexer.h
+++ b/include/swift/Parse/Lexer.h
@@ -39,7 +39,6 @@ namespace swift {
   template<typename ...T> struct Diag;
 
 enum class CommentRetentionMode {
-  None,
   AttachToNextToken,
   ReturnAsTokens,
 };
@@ -185,7 +184,7 @@ class Lexer {
       const LangOptions &Options, const SourceManager &SourceMgr,
       unsigned BufferID, DiagnosticEngine *Diags, LexerMode LexMode,
       HashbangMode HashbangAllowed = HashbangMode::Disallowed,
-      CommentRetentionMode RetainComments = CommentRetentionMode::None);
+      CommentRetentionMode RetainComments = CommentRetentionMode::AttachToNextToken);
 
   /// Create a lexer that scans a subrange of the source buffer.
   Lexer(const LangOptions &Options, const SourceManager &SourceMgr,
diff --git a/include/swift/Parse/Parser.h b/include/swift/Parse/Parser.h
index 9877dfddc848a..c06cdc402a9e8 100644
--- a/include/swift/Parse/Parser.h
+++ b/include/swift/Parse/Parser.h
@@ -693,6 +693,10 @@ class Parser {
     return Context.LangOpts.EnableExperimentalConcurrency;
   }
 
+  bool shouldAttachCommentToDecl() {
+    return Context.LangOpts.AttachCommentsToDecls;
+  }
+
   /// Returns true if a Swift declaration starts after the current token,
   /// otherwise returns false.
   bool isNextStartOfSwiftDecl() {
diff --git a/lib/IDE/Formatting.cpp b/lib/IDE/Formatting.cpp
index cbe1d9ca7422a..364c990da08e5 100644
--- a/lib/IDE/Formatting.cpp
+++ b/lib/IDE/Formatting.cpp
@@ -49,7 +49,7 @@ static void widenOrSet(SourceRange &First, SourceRange Second) {
 static bool isFirstTokenOnLine(SourceManager &SM, SourceLoc Loc) {
   assert(Loc.isValid());
   SourceLoc LineStart = Lexer::getLocForStartOfLine(SM, Loc);
-  CommentRetentionMode SkipComments = CommentRetentionMode::None;
+  CommentRetentionMode SkipComments = CommentRetentionMode::AttachToNextToken;
   Token First = Lexer::getTokenAtLocation(SM, LineStart, SkipComments);
   return First.getLoc() == Loc;
 }
@@ -76,8 +76,8 @@ static std::optional<Token> getTokenAfter(SourceManager &SM, SourceLoc Loc,
                                           bool SkipComments = true) {
   assert(Loc.isValid());
   CommentRetentionMode Mode = SkipComments
-    ? CommentRetentionMode::None
-    : CommentRetentionMode::ReturnAsTokens;
+                                  ? CommentRetentionMode::AttachToNextToken
+                                  : CommentRetentionMode::ReturnAsTokens;
   assert(Lexer::getTokenAtLocation(SM, Loc, Mode).getLoc() == Loc);
   SourceLoc End = Lexer::getLocForEndOfToken(SM, Loc);
   Token Next = Lexer::getTokenAtLocation(SM, End, Mode);
@@ -814,8 +814,8 @@ class OutdentChecker: protected RangeWalker {
     } else if (R.isValid()) {
       // Check condition 2a.
       SourceLoc LineStart = Lexer::getLocForStartOfLine(SM, R);
-      Token First = Lexer::getTokenAtLocation(SM, LineStart,
-                                              CommentRetentionMode::None);
+      Token First = Lexer::getTokenAtLocation(
+          SM, LineStart, CommentRetentionMode::AttachToNextToken);
       IsOutdenting |= First.getLoc() == R;
     }
 
diff --git a/lib/Parse/Lexer.cpp b/lib/Parse/Lexer.cpp
index d9f3c70ffddc9..ca0cb2dd8de84 100644
--- a/lib/Parse/Lexer.cpp
+++ b/lib/Parse/Lexer.cpp
@@ -270,7 +270,7 @@ Token Lexer::getTokenAt(SourceLoc Loc) {
          "location from the wrong buffer");
 
   Lexer L(LangOpts, SourceMgr, BufferID, getUnderlyingDiags(), LexMode,
-          HashbangMode::Allowed, CommentRetentionMode::None);
+          HashbangMode::Allowed, CommentRetentionMode::AttachToNextToken);
   L.restoreState(State(Loc));
   return L.peekNextToken();
 }
@@ -3029,7 +3029,7 @@ static SourceLoc getLocForStartOfTokenInBuf(SourceManager &SM,
   LangOptions FakeLangOptions;
 
   Lexer L(FakeLangOptions, SM, BufferID, nullptr, LexerMode::Swift,
-          HashbangMode::Allowed, CommentRetentionMode::None,
+          HashbangMode::Allowed, CommentRetentionMode::AttachToNextToken,
           BufferStart, BufferEnd);
 
   // Lex tokens until we find the token that contains the source location.
diff --git a/lib/Parse/ParseDecl.cpp b/lib/Parse/ParseDecl.cpp
index f80bee63975e0..190d52c256328 100644
--- a/lib/Parse/ParseDecl.cpp
+++ b/lib/Parse/ParseDecl.cpp
@@ -6204,7 +6204,7 @@ ParserStatus Parser::parseDecl(bool IsAtStartOfLineOrPreviousHadSemi,
 
   // Parse attributes.
   DeclAttributes Attributes;
-  if (Tok.hasComment())
+  if (Tok.hasComment() && shouldAttachCommentToDecl())
     Attributes.add(new (Context) RawDocCommentAttr(Tok.getCommentRange()));
   ParserStatus AttrStatus = parseDeclAttributeList(
       Attributes, IfConfigsAreDeclAttrs);
diff --git a/lib/Parse/ParseGeneric.cpp b/lib/Parse/ParseGeneric.cpp
index ea0cfa8d9e9f5..e2450965022d9 100644
--- a/lib/Parse/ParseGeneric.cpp
+++ b/lib/Parse/ParseGeneric.cpp
@@ -56,7 +56,7 @@ Parser::parseGenericParametersBeforeWhere(SourceLoc LAngleLoc,
 
     // Parse attributes.
     DeclAttributes attributes;
-    if (Tok.hasComment())
+    if (Tok.hasComment() && shouldAttachCommentToDecl())
       attributes.add(new (Context) RawDocCommentAttr(Tok.getCommentRange()));
     parseDeclAttributeList(attributes);
 
diff --git a/lib/Parse/Parser.cpp b/lib/Parse/Parser.cpp
index 92b392131b44d..4b7c67f982542 100644
--- a/lib/Parse/Parser.cpp
+++ b/lib/Parse/Parser.cpp
@@ -341,21 +341,15 @@ Parser::Parser(unsigned BufferID, SourceFile &SF, SILParserStateBase *SIL,
                PersistentParserState *PersistentState)
     : Parser(BufferID, SF, &SF.getASTContext().Diags, SIL, PersistentState) {}
 
-Parser::Parser(unsigned BufferID, SourceFile &SF, DiagnosticEngine* LexerDiags,
-               SILParserStateBase *SIL,
-               PersistentParserState *PersistentState)
-    : Parser(
-          std::unique_ptr<Lexer>(new Lexer(
-              SF.getASTContext().LangOpts, SF.getASTContext().SourceMgr,
-              BufferID, LexerDiags,
-              sourceFileKindToLexerMode(SF.Kind),
-              SF.Kind == SourceFileKind::Main
-                  ? HashbangMode::Allowed
-                  : HashbangMode::Disallowed,
-              SF.getASTContext().LangOpts.AttachCommentsToDecls
-                  ? CommentRetentionMode::AttachToNextToken
-                  : CommentRetentionMode::None)),
-          SF, SIL, PersistentState) {}
+Parser::Parser(unsigned BufferID, SourceFile &SF, DiagnosticEngine *LexerDiags,
+               SILParserStateBase *SIL, PersistentParserState *PersistentState)
+    : Parser(std::unique_ptr<Lexer>(new Lexer(
+                 SF.getASTContext().LangOpts, SF.getASTContext().SourceMgr,
+                 BufferID, LexerDiags, sourceFileKindToLexerMode(SF.Kind),
+                 SF.Kind == SourceFileKind::Main ? HashbangMode::Allowed
+                                                 : HashbangMode::Disallowed,
+                 CommentRetentionMode::AttachToNextToken)),
+             SF, SIL, PersistentState) {}
 
 namespace {
 
@@ -1243,12 +1237,10 @@ ParserUnit::ParserUnit(SourceManager &SM, SourceFileKind SFKind,
     : Impl(*new Implementation(SM, SFKind, BufferID, LangOptions(), "input")) {
 
   std::unique_ptr<Lexer> Lex;
-  Lex.reset(new Lexer(Impl.LangOpts, SM,
-                      BufferID, &Impl.Diags,
-                      LexerMode::Swift,
-                      HashbangMode::Allowed,
-                      CommentRetentionMode::None,
-                      Offset, EndOffset));
+  Lex.reset(new Lexer(Impl.LangOpts, SM, BufferID, &Impl.Diags,
+                      LexerMode::Swift, HashbangMode::Allowed,
+                      CommentRetentionMode::AttachToNextToken, Offset,
+                      EndOffset));
   Impl.TheParser.reset(new Parser(std::move(Lex), *Impl.SF, /*SIL=*/nullptr,
                                   /*PersistentState=*/nullptr));
 }
diff --git a/lib/Refactoring/Async/AsyncConverter.cpp b/lib/Refactoring/Async/AsyncConverter.cpp
index 1b4bc8708b79f..156ee48444be0 100644
--- a/lib/Refactoring/Async/AsyncConverter.cpp
+++ b/lib/Refactoring/Async/AsyncConverter.cpp
@@ -651,8 +651,8 @@ void AsyncConverter::addFuncDecl(const FuncDecl *FD) {
     LeftEndLoc = Lexer::getLocForEndOfToken(
         SM, Params->get(TopHandler.Index - 1)->getEndLoc());
     // Skip to the end of any comments
-    Token Next =
-        Lexer::getTokenAtLocation(SM, LeftEndLoc, CommentRetentionMode::None);
+    Token Next = Lexer::getTokenAtLocation(
+        SM, LeftEndLoc, CommentRetentionMode::AttachToNextToken);
     if (Next.getKind() != tok::NUM_TOKENS)
       LeftEndLoc = Next.getLoc();
     break;
diff --git a/lib/Sema/MiscDiagnostics.cpp b/lib/Sema/MiscDiagnostics.cpp
index 0710fddd3aabe..81ce8ab549a86 100644
--- a/lib/Sema/MiscDiagnostics.cpp
+++ b/lib/Sema/MiscDiagnostics.cpp
@@ -2559,8 +2559,9 @@ class ImplicitSelfUsageChecker : public BaseDiagnosticWalker {
     // insert 'self,'. If it wasn't a valid entry, then we will at least not
     // be introducing any new errors/warnings...
     const auto locAfterBracket = brackets.Start.getAdvancedLoc(1);
-    const auto nextAfterBracket = Lexer::getTokenAtLocation(
-        Ctx.SourceMgr, locAfterBracket, CommentRetentionMode::None);
+    const auto nextAfterBracket =
+        Lexer::getTokenAtLocation(Ctx.SourceMgr, locAfterBracket,
+                                  CommentRetentionMode::AttachToNextToken);
     if (nextAfterBracket.getLoc() != brackets.End)
       diag.fixItInsertAfter(brackets.Start, "self, ");
     else
@@ -2581,8 +2582,8 @@ class ImplicitSelfUsageChecker : public BaseDiagnosticWalker {
     // opening brace of the closure, we may need to pad the fix-it
     // with a space.
     const auto nextLoc = closureExpr->getLoc().getAdvancedLoc(1);
-    const auto next = Lexer::getTokenAtLocation(Ctx.SourceMgr, nextLoc,
-                                                CommentRetentionMode::None);
+    const auto next = Lexer::getTokenAtLocation(
+        Ctx.SourceMgr, nextLoc, CommentRetentionMode::AttachToNextToken);
     std::string trailing = next.getLoc() == nextLoc ? " " : "";
 
     diag.fixItInsertAfter(closureExpr->getLoc(), " [self] in" + trailing);
diff --git a/test/Parse/backgtrack-startofline.swift b/test/Parse/backgtrack-startofline.swift
new file mode 100644
index 0000000000000..d59a4b9639a14
--- /dev/null
+++ b/test/Parse/backgtrack-startofline.swift
@@ -0,0 +1,7 @@
+// RUN: %target-typecheck-verify-swift -verify-ignore-unrelated
+
+func test() {
+  _ = "test"
+  /* comment */ #if true
+  #endif
+}
diff --git a/unittests/Parse/LexerTests.cpp b/unittests/Parse/LexerTests.cpp
index 82b9d276c7f82..f016dc020949c 100644
--- a/unittests/Parse/LexerTests.cpp
+++ b/unittests/Parse/LexerTests.cpp
@@ -271,32 +271,6 @@ TEST_F(LexerTest, ContentStartTokenIsStartOfLineUTF8BOM) {
   ASSERT_TRUE(Tok.isAtStartOfLine());
 }
 
-TEST_F(LexerTest, BOMNoCommentNoTrivia) {
-  const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */";
-  
-  LangOptions LangOpts;
-  SourceManager SourceMgr;
-  unsigned BufferID = SourceMgr.addMemBufferCopy(StringRef(Source));
-  
-  Lexer L(LangOpts, SourceMgr, BufferID, /*Diags=*/nullptr, LexerMode::Swift,
-          HashbangMode::Disallowed, CommentRetentionMode::None);
-  
-  Token Tok;
-
-  L.lex(Tok);
-  ASSERT_EQ(tok::identifier, Tok.getKind());
-  ASSERT_EQ("aaa", Tok.getText());
-  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getLoc());
-  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 14), Tok.getCommentRange().getStart());
-  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
-  
-  L.lex(Tok);
-  ASSERT_EQ(tok::eof, Tok.getKind());
-  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getLoc());
-  ASSERT_EQ(SourceMgr.getLocForOffset(BufferID, 31), Tok.getCommentRange().getStart());
-  ASSERT_EQ(0u, Tok.getCommentRange().getByteLength());
-}
-
 TEST_F(LexerTest, BOMTokenCommentNoTrivia) {
   const char *Source = "\xEF\xBB\xBF" "// comment\naaa //xx \n/* x */";
   
@@ -718,9 +692,8 @@ TEST_F(LexerTest, DiagnoseEmbeddedNul) {
   DiagnosticEngine Diags(SourceMgr);
   Diags.addConsumer(DiagConsumer);
 
-  Lexer L(LangOpts, SourceMgr, BufferID, &Diags,
-          LexerMode::Swift, HashbangMode::Disallowed,
-          CommentRetentionMode::None);
+  Lexer L(LangOpts, SourceMgr, BufferID, &Diags, LexerMode::Swift,
+          HashbangMode::Disallowed, CommentRetentionMode::AttachToNextToken);
 
   Token Tok;
   L.lex(Tok);
@@ -743,9 +716,8 @@ TEST_F(LexerTest, DiagnoseEmbeddedNulOffset) {
   DiagnosticEngine Diags(SourceMgr);
   Diags.addConsumer(DiagConsumer);
 
-  Lexer L(LangOpts, SourceMgr, BufferID, &Diags,
-          LexerMode::Swift, HashbangMode::Disallowed,
-          CommentRetentionMode::None,
+  Lexer L(LangOpts, SourceMgr, BufferID, &Diags, LexerMode::Swift,
+          HashbangMode::Disallowed, CommentRetentionMode::AttachToNextToken,
           /*Offset=*/5, /*EndOffset=*/SourceLen);
 
   ASSERT_FALSE(containsPrefix(