From 096d39d4051af8e918c9d8d77554487525ac48d7 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Fri, 1 Apr 2022 14:34:51 +0100
Subject: [PATCH 1/7] Better filter trivia in dumps

Make sure we don't try to print things like
empty comma lists `,,,,` or redundant parens for
concatenations that had their trivia filtered out.
---
 .../_RegexParser/Regex/Printing/DumpAST.swift | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift
index 47142407a..0e40ad2ce 100644
--- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift
+++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift
@@ -44,18 +44,23 @@ extension _ASTPrintable {
     guard let children = _children else {
       return _dumpBase
     }
-    let sub = children.lazy.compactMap {
+    let childDump = children.compactMap { child -> String? in
       // Exclude trivia for now, as we don't want it to appear when performing
       // comparisons of dumped output in tests.
       // TODO: We should eventually have some way of filtering out trivia for
       // tests, so that it can appear in regular dumps.
-      if $0.isTrivia { return nil }
-      return $0._dump()
-    }.joined(separator: ",")
-    if sub.isEmpty {
-      return "\(_dumpBase)"
+      if child.isTrivia { return nil }
+      let dump = child._dump()
+      return !dump.isEmpty ? dump : nil
     }
-    return "\(_dumpBase)(\(sub))"
+    let base = "\(_dumpBase)"
+    if childDump.isEmpty {
+      return base
+    }
+    if childDump.count == 1, base.isEmpty {
+      return "\(childDump[0])"
+    }
+    return "\(base)(\(childDump.joined(separator: ",")))"
   }
 }
 
@@ -77,7 +82,7 @@ extension AST.Node: _ASTPrintable {
 }
 
 extension AST.Alternation {
-  public var _dumpBase: String { "alternation" }
+  public var _dumpBase: String { "alternation<\(children.count)>" }
 }
 
 extension AST.Concatenation {

From c6dc547908bd3aab852e04c47286a651b31d8c00 Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Fri, 1 Apr 2022 14:34:51 +0100
Subject: [PATCH 2/7] Formalize non-semantic whitespace matching

Turns out this is a Unicode-defined property (`Pattern_White_Space`).
---
 .../Regex/Parse/LexicalAnalysis.swift         | 26 ++++---------------
 .../_RegexParser/Utility/MissingUnicode.swift |  6 +++++
 2 files changed, 11 insertions(+), 21 deletions(-)

diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
index 4eb0ebea4..b595f3d29 100644
--- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -550,28 +550,12 @@ extension Source {
   ) throws -> AST.Trivia? {
     guard context.ignoreWhitespace else { return nil }
 
-    func isWhitespace(_ c: Character) -> Bool {
-      // This is a list of characters that PCRE treats as whitespace when
-      // compiled with Unicode support. It is a subset of the characters with
-      // the `.isWhitespace` property. ICU appears to also follow this list.
-      // Oniguruma and .NET follow a subset of this list.
-      //
-      // FIXME: PCRE only treats space and tab characters as whitespace when
-      // inside a custom character class (and only treats whitespace as
-      // non-semantic there for the extra-extended `(?xx)` mode). If we get a
-      // strict-PCRE mode, we'll need to add a case for that.
-      switch c {
-      case " ", "\u{9}"..."\u{D}", // space, \t, \n, vertical tab, \f, \r
-           "\u{85}", "\u{200E}",   // next line, left-to-right mark
-           "\u{200F}", "\u{2028}", // right-to-left-mark, line separator
-           "\u{2029}":             // paragraph separator
-        return true
-      default:
-        return false
-      }
-    }
+    // FIXME: PCRE only treats space and tab characters as whitespace when
+    // inside a custom character class (and only treats whitespace as
+    // non-semantic there for the extra-extended `(?xx)` mode). If we get a
+    // strict-PCRE mode, we'll need to add a case for that.
     let trivia: Located<String>? = recordLoc { src in
-      src.tryEatPrefix(isWhitespace)?.string
+      src.tryEatPrefix(\.isPatternWhitespace)?.string
     }
     guard let trivia = trivia else { return nil }
     return AST.Trivia(trivia)
diff --git a/Sources/_RegexParser/Utility/MissingUnicode.swift b/Sources/_RegexParser/Utility/MissingUnicode.swift
index dccba3286..4d819806b 100644
--- a/Sources/_RegexParser/Utility/MissingUnicode.swift
+++ b/Sources/_RegexParser/Utility/MissingUnicode.swift
@@ -660,6 +660,12 @@ extension Character {
   public var isOctalDigit: Bool { ("0"..."7").contains(self) }
 
   public var isWordCharacter: Bool { isLetter || isNumber || self == "_" }
+
+  /// Whether this character represents whitespace for the purposes of pattern
+  /// parsing.
+  public var isPatternWhitespace: Bool {
+    return unicodeScalars.first!.properties.isPatternWhitespace
+  }
 }
 
 extension UnicodeScalar {

From a96648badd28106b4db723aca44b1f83fa956ffe Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Fri, 1 Apr 2022 14:34:52 +0100
Subject: [PATCH 3/7] Rename endOfString -> unterminated

---
 .../Regex/Parse/DelimiterLexing.swift            |  8 ++++----
 Sources/_RegexParser/Regex/Parse/Mocking.swift   |  2 +-
 Tests/RegexTests/ParseTests.swift                | 16 ++++++++--------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
index 1227ade1f..e88c1fa80 100644
--- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
+++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
@@ -41,7 +41,7 @@ enum Delimiter: Hashable, CaseIterable {
 
 struct DelimiterLexError: Error, CustomStringConvertible {
   enum Kind: Hashable {
-    case endOfString
+    case unterminated
     case invalidUTF8 // TODO: better range reporting
     case unknownDelimiter
     case unprintableASCII
@@ -59,7 +59,7 @@ struct DelimiterLexError: Error, CustomStringConvertible {
 
   var description: String {
     switch kind {
-    case .endOfString: return "unterminated regex literal"
+    case .unterminated: return "unterminated regex literal"
     case .invalidUTF8: return "invalid UTF-8 found in source file"
     case .unknownDelimiter: return "unknown regex literal delimiter"
     case .unprintableASCII: return "unprintable ASCII character found in source file"
@@ -238,7 +238,7 @@ fileprivate struct DelimiterLexer {
   /// the end of the buffer is reached.
   mutating func advance(escaped: Bool = false) throws {
     guard let next = load() else {
-      throw DelimiterLexError(.endOfString, resumeAt: cursor)
+      throw DelimiterLexError(.unterminated, resumeAt: cursor)
     }
     switch UnicodeScalar(next) {
     case let next where !next.isASCII:
@@ -249,7 +249,7 @@ fileprivate struct DelimiterLexer {
       advanceCursor()
 
     case "\n", "\r":
-      throw DelimiterLexError(.endOfString, resumeAt: cursor)
+      throw DelimiterLexError(.unterminated, resumeAt: cursor)
 
     case "\0":
       // TODO: Warn to match the behavior of String literal lexer? Or should
diff --git a/Sources/_RegexParser/Regex/Parse/Mocking.swift b/Sources/_RegexParser/Regex/Parse/Mocking.swift
index 5994a4f52..596a59bf4 100644
--- a/Sources/_RegexParser/Regex/Parse/Mocking.swift
+++ b/Sources/_RegexParser/Regex/Parse/Mocking.swift
@@ -62,7 +62,7 @@ func libswiftLexRegexLiteral(
     curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self)
 
     switch error.kind {
-    case .endOfString:
+    case .unterminated:
       // Missing closing delimiter can be recovered from.
       return false
     case .unprintableASCII, .invalidUTF8:
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index f6f31c075..649ea22e2 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -2079,21 +2079,21 @@ extension RegexTests {
 
     // MARK: Printable ASCII
 
-    delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString)
+    delimiterLexingDiagnosticTest(#"re'\\#n'"#, .unterminated)
     for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r.
       delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII)
     }
-    delimiterLexingDiagnosticTest("re'\n'", .endOfString)
-    delimiterLexingDiagnosticTest("re'\r'", .endOfString)
+    delimiterLexingDiagnosticTest("re'\n'", .unterminated)
+    delimiterLexingDiagnosticTest("re'\r'", .unterminated)
     delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII)
 
     // MARK: Delimiter skipping
 
-    delimiterLexingDiagnosticTest("re'(?''", .endOfString)
-    delimiterLexingDiagnosticTest("re'(?'abc'", .endOfString)
-    delimiterLexingDiagnosticTest("re'(?('abc'", .endOfString)
-    delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .endOfString)
-    delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .endOfString)
+    delimiterLexingDiagnosticTest("re'(?''", .unterminated)
+    delimiterLexingDiagnosticTest("re'(?'abc'", .unterminated)
+    delimiterLexingDiagnosticTest("re'(?('abc'", .unterminated)
+    delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .unterminated)
+    delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .unterminated)
   }
 
   func testlibswiftDiagnostics() {

From 120ffc90de110ed3e2d1af382cb2f0f093e340da Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Fri, 1 Apr 2022 14:34:52 +0100
Subject: [PATCH 4/7] Fix end-of-line-comment lexing

Previously we would just lex to the end of the
input, as it was assumed only single-line regex
would be supported. Update the implementation to
handle multi-line, and take account of PCRE global
options.
---
 .../Regex/AST/MatchingOptions.swift           |   3 +-
 .../Regex/Parse/LexicalAnalysis.swift         |  26 ++-
 Sources/_RegexParser/Regex/Parse/Parse.swift  |  13 ++
 Sources/_RegexParser/Regex/Parse/Source.swift |   6 +
 Tests/RegexTests/ParseTests.swift             | 208 ++++++++++++++++++
 5 files changed, 250 insertions(+), 6 deletions(-)

diff --git a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift
index 25cb10842..808b51287 100644
--- a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift
+++ b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift
@@ -137,7 +137,8 @@ extension AST {
   /// Global matching option specifiers. Unlike `MatchingOptionSequence`,
   /// these must appear at the start of the pattern, and apply globally.
   public struct GlobalMatchingOption: _ASTNode, Hashable {
-    /// Determines the definition of a newline for the '.' character class.
+    /// Determines the definition of a newline for the '.' character class and
+    /// when parsing end-of-line comments.
     public enum NewlineMatching: Hashable {
       /// (*CR*)
       case carriageReturnOnly
diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
index b595f3d29..165e97d1a 100644
--- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -528,11 +528,27 @@ extension Source {
         return try src.expectQuoted(endingWith: "*/").value
       }
       if context.endOfLineComments, src.tryEat("#") {
-        // TODO: If we ever support multi-line regex literals, this will need
-        // to be updated to stop at a newline. Note though that PCRE specifies
-        // that the newline it matches against can be controlled by the global
-        // matching options e.g `(*CR)`, `(*ANY)`, ...
-        return src.lexUntil(\.isEmpty).value
+        // Try to eat until we either exhaust the input, or hit a newline. Note
+        // that the definition of newline can be altered depending on the global
+        // matching options. By default we consider a newline to be `\n` or
+        // `\r`.
+        return src.lexUntil { src in
+          if src.isEmpty { return true }
+          switch context.newlineMode {
+          case .carriageReturnOnly:
+            return src.tryEat("\r")
+          case .linefeedOnly:
+            return src.tryEat("\n")
+          case .carriageAndLinefeedOnly:
+            return src.tryEat("\r\n")
+          case .anyCarriageReturnOrLinefeed:
+            return src.tryEat(anyOf: "\r", "\n", "\r\n") != nil
+          case .anyUnicode:
+            return src.tryEat(where: \.isNewline)
+          case .nulCharacter:
+            return src.tryEat("\0")
+          }
+        }.value
       }
       return nil
     }
diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift
index 7867073e6..2512f9bf2 100644
--- a/Sources/_RegexParser/Regex/Parse/Parse.swift
+++ b/Sources/_RegexParser/Regex/Parse/Parse.swift
@@ -76,6 +76,10 @@ struct ParsingContext {
   /// The syntax options currently set.
   fileprivate(set) var syntax: SyntaxOptions
 
+  /// The current newline matching mode.
+  fileprivate(set) var newlineMode: AST.GlobalMatchingOption.NewlineMatching
+    = .anyCarriageReturnOrLinefeed
+
   fileprivate mutating func recordGroup(_ g: AST.Group.Kind) {
     // TODO: Needs to track group number resets (?|...).
     priorGroupCount += 1
@@ -139,6 +143,15 @@ extension Parser {
     // First parse any global matching options if present.
     let opts = try source.lexGlobalMatchingOptionSequence()
 
+    // If we have a newline mode global option, update the context accordingly.
+    if let opts = opts {
+      for opt in opts.options.reversed() {
+        guard case .newlineMatching(let newline) = opt.kind else { continue }
+        context.newlineMode = newline
+        break
+      }
+    }
+
     // Then parse the root AST node.
     let ast = try parseNode()
     guard source.isEmpty else {
diff --git a/Sources/_RegexParser/Regex/Parse/Source.swift b/Sources/_RegexParser/Regex/Parse/Source.swift
index ddf0475f3..6eac16395 100644
--- a/Sources/_RegexParser/Regex/Parse/Source.swift
+++ b/Sources/_RegexParser/Regex/Parse/Source.swift
@@ -68,6 +68,12 @@ extension Source {
     return true
   }
 
+  mutating func tryEat(where pred: (Char) throws -> Bool) rethrows -> Bool {
+    guard let next = peek(), try pred(next) else { return false }
+    advance()
+    return true
+  }
+
   mutating func tryEat<C: Collection>(sequence c: C) -> Bool
   where C.Element == Char {
     guard _slice.starts(with: c) else { return false }
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index 649ea22e2..b185234a0 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -1526,6 +1526,214 @@ extension RegexTests {
         matchingOptions(adding: .extended), isIsolated: true, charClass("a", "b"))
     )
 
+    // Test multi-line comment handling.
+    parseTest(
+      """
+      # a
+      bc # d
+      ef# g
+      # h
+      """,
+      concat("b", "c", "e", "f"),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      # a\r\
+      bc # d\r\
+      ef# g\r\
+      # h\r
+      """,
+      concat("b", "c", "e", "f"),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      # a\r\
+      bc # d\r\
+      ef# g\r\
+      # h\r
+      """,
+      concat("b", "c", "e", "f"),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      # a\r
+      bc # d\r
+      ef# g\r
+      # h\r
+      """,
+      concat("b", "c", "e", "f"),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      # a\n\r\
+      bc # d\n\r\
+      ef# g\n\r\
+      # h\n\r
+      """,
+      concat("b", "c", "e", "f"),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      (*CR)
+      # a
+      bc # d
+      ef# g
+      # h
+      """,
+      ast(empty(), opts: .newlineMatching(.carriageReturnOnly)),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      (*CR)\r\
+      # a\r\
+      bc # d\r\
+      ef# g\r\
+      # h
+      """,
+      ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageReturnOnly)),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      (*LF)
+      # a
+      bc # d
+      ef# g
+      # h
+      """,
+      ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.linefeedOnly)),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      (*CRLF)
+      # a
+      bc # d
+      ef# g
+      # h
+      """,
+      ast(empty(), opts: .newlineMatching(.carriageAndLinefeedOnly)),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      (*CRLF)
+      # a\r
+      bc # d\r
+      ef# g\r
+      # h
+      """,
+      ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageAndLinefeedOnly)),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      (*ANYCRLF)
+      # a
+      bc # d
+      ef# g
+      # h
+      """,
+      ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      (*ANYCRLF)
+      # a\r\
+      bc # d\r\
+      ef# g\r\
+      # h
+      """,
+      ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      (*ANYCRLF)
+      # a\r
+      bc # d\r
+      ef# g\r
+      # h
+      """,
+      ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      (*ANY)
+      # a
+      bc # d
+      ef# g
+      # h
+      """,
+      ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      # a\u{2028}\
+      bc # d
+      ef# g\u{2028}\
+      # h
+      """,
+      concat("e", "f"),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      (*ANY)
+      # a\u{2028}\
+      bc # d\u{2028}\
+      ef# g\u{2028}\
+      # h
+      """,
+      ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      (*NUL)
+      # a
+      bc # d\0\
+      ef# g
+      # h
+      """,
+      ast(concat("e", "f"), opts: .newlineMatching(.nulCharacter)),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      (*NUL)
+      # a\0\
+      bc # d\0\
+      ef# g\0\
+      # h
+      """,
+      ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.nulCharacter)),
+      syntax: .extendedSyntax
+    )
+    parseTest(
+      """
+      (*CR)(*NUL)
+      # a\0\
+      bc # d\0\
+      ef# g\0\
+      # h
+      """,
+      ast(concat("b", "c", "e", "f"),
+          opts: .newlineMatching(.carriageReturnOnly),
+                .newlineMatching(.nulCharacter)
+         ),
+      syntax: .extendedSyntax
+    )
+
     // MARK: Parse with delimiters
 
     parseWithDelimitersTest("#/a b/#", concat("a", " ", "b"))

From 4944fbea80d5abbbcc2bc03cc511868aebae949e Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Fri, 1 Apr 2022 14:34:52 +0100
Subject: [PATCH 5/7] Lex extended pound delimiters

Start lexing `/.../`, and allow any number of
pound signs to surround it.
---
 .../Regex/Parse/DelimiterLexing.swift         | 152 +++++++++++++-----
 Tests/RegexTests/LexTests.swift               |  25 +--
 Tests/RegexTests/ParseTests.swift             |  12 ++
 3 files changed, 143 insertions(+), 46 deletions(-)

diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
index e88c1fa80..fa6ca978a 100644
--- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
+++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
@@ -11,27 +11,27 @@
 
 // TODO: mock up multi-line soon
 
-enum Delimiter: Hashable, CaseIterable {
-  case traditional
-  case experimental
-  case reSingleQuote
-  case rxSingleQuote
-
-  var openingAndClosing: (opening: String, closing: String) {
-    switch self {
-    case .traditional: return ("#/", "/#")
-    case .experimental: return ("#|", "|#")
-    case .reSingleQuote: return ("re'", "'")
-    case .rxSingleQuote: return ("rx'", "'")
-    }
+struct Delimiter: Hashable {
+  let kind: Kind
+  let poundCount: Int
+
+  init(_ kind: Kind, poundCount: Int) {
+    precondition(kind.allowsExtendedPoundSyntax || poundCount == 0)
+    self.kind = kind
+    self.poundCount = poundCount
+  }
+
+  var opening: String {
+    String(repeating: "#", count: poundCount) + kind.opening
+  }
+  var closing: String {
+    kind.closing + String(repeating: "#", count: poundCount)
   }
-  var opening: String { openingAndClosing.opening }
-  var closing: String { openingAndClosing.closing }
 
   /// The default set of syntax options that the delimiter indicates.
   var defaultSyntaxOptions: SyntaxOptions {
-    switch self {
-    case .traditional, .reSingleQuote:
+    switch kind {
+    case .forwardSlash, .reSingleQuote:
       return .traditional
     case .experimental, .rxSingleQuote:
       return .experimental
@@ -39,6 +39,37 @@ enum Delimiter: Hashable, CaseIterable {
   }
 }
 
+extension Delimiter {
+  enum Kind: Hashable, CaseIterable {
+    case forwardSlash
+    case experimental
+    case reSingleQuote
+    case rxSingleQuote
+
+    var openingAndClosing: (opening: String, closing: String) {
+      switch self {
+      case .forwardSlash: return ("/", "/")
+      case .experimental: return ("#|", "|#")
+      case .reSingleQuote: return ("re'", "'")
+      case .rxSingleQuote: return ("rx'", "'")
+      }
+    }
+    var opening: String { openingAndClosing.opening }
+    var closing: String { openingAndClosing.closing }
+
+    /// Whether or not extended pound syntax e.g. `##/.../##` is allowed with
+    /// this delimiter.
+    var allowsExtendedPoundSyntax: Bool {
+      switch self {
+      case .forwardSlash:
+        return true
+      case .experimental, .reSingleQuote, .rxSingleQuote:
+        return false
+      }
+    }
+  }
+}
+
 struct DelimiterLexError: Error, CustomStringConvertible {
   enum Kind: Hashable {
     case unterminated
@@ -120,16 +151,25 @@ fileprivate struct DelimiterLexer {
     precondition(cursor <= end, "Cannot advance past end")
   }
 
-  /// Check to see if a UTF-8 sequence can be eaten from the current cursor.
-  func canEat(_ utf8: String.UTF8View) -> Bool {
-    guard let slice = slice(utf8.count) else { return false }
-    return slice.elementsEqual(utf8)
+  /// Check to see if a byte sequence can be eaten from the current cursor.
+  func canEat<C : Collection>(_ bytes: C) -> Bool where C.Element == UInt8 {
+    guard let slice = slice(bytes.count) else { return false }
+    return slice.elementsEqual(bytes)
+  }
+
+  /// Attempt to eat a byte sequence, returning `true` if successful.
+  mutating func tryEat<C : Collection>(
+    _ bytes: C
+  ) -> Bool where C.Element == UInt8 {
+    guard canEat(bytes) else { return false }
+    advanceCursor(bytes.count)
+    return true
   }
 
-  /// Attempt to eat a UTF-8 byte sequence, returning `true` if successful.
-  mutating func tryEat(_ utf8: String.UTF8View) -> Bool {
-    guard canEat(utf8) else { return false }
-    advanceCursor(utf8.count)
+  /// Attempt to eat an ASCII scalar, returning `true` if successful.
+  mutating func tryEat(ascii s: Unicode.Scalar) -> Bool {
+    guard load() == ascii(s) else { return false }
+    advanceCursor()
     return true
   }
 
@@ -137,8 +177,8 @@ fileprivate struct DelimiterLexer {
   /// the actual closing delimiter.
   mutating func trySkipDelimiter(_ delimiter: Delimiter) {
     // Only the closing `'` for re'...'/rx'...' can potentially be skipped over.
-    switch delimiter {
-    case .traditional, .experimental:
+    switch delimiter.kind {
+    case .forwardSlash, .experimental:
       return
     case .reSingleQuote, .rxSingleQuote:
       break
@@ -272,16 +312,42 @@ fileprivate struct DelimiterLexer {
     }
   }
 
+  mutating func tryLexOpeningDelimiter(poundCount: Int) -> Delimiter? {
+    for kind in Delimiter.Kind.allCases {
+      // If the delimiter allows extended pound syntax, or there are no pounds,
+      // we just need to lex it.
+      let opening = kind.opening.utf8
+      if kind.allowsExtendedPoundSyntax || poundCount == 0 {
+        guard tryEat(opening) else { continue }
+        return Delimiter(kind, poundCount: poundCount)
+      }
+
+      // The delimiter doesn't allow extended pound syntax, so the pounds must be
+      // part of the delimiter.
+      guard
+        poundCount < opening.count,
+        opening.prefix(poundCount)
+          .elementsEqual(repeatElement(ascii("#"), count: poundCount)),
+        tryEat(opening.dropFirst(poundCount))
+      else { continue }
+
+      return Delimiter(kind, poundCount: 0)
+    }
+    return nil
+  }
+
   /*consuming*/ mutating func lex(
   ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
+    // We can consume any number of pound signs.
+    var poundCount = 0
+    while tryEat(ascii: "#") {
+      poundCount += 1
+    }
 
     // Try to lex the opening delimiter.
-    guard let delimiter = Delimiter.allCases.first(
-      where: { tryEat($0.opening.utf8) }
-    ) else {
+    guard let delimiter = tryLexOpeningDelimiter(poundCount: poundCount) else {
       throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor())
     }
-
     let contentsStart = cursor
     while true {
       // Check to see if we're at a character that looks like a delimiter, but
@@ -302,20 +368,34 @@ fileprivate struct DelimiterLexer {
 /// Drop a set of regex delimiters from the input string, returning the contents
 /// and the delimiter used. The input string must have valid delimiters.
 func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
-  func stripDelimiter(_ delim: Delimiter) -> String? {
+  func stripDelimiter(_ kind: Delimiter.Kind) -> (String, Delimiter)? {
+    var slice = str.utf8[...]
+
+    // Try to strip any number of opening '#'s.
+    var poundCount = 0
+    if kind.allowsExtendedPoundSyntax {
+      poundCount = slice.prefix(while: {
+        $0 == UInt8(("#" as UnicodeScalar).value)
+      }).count
+      slice = slice.dropFirst(poundCount)
+    }
+
     // The opening delimiter must match.
-    guard var slice = str.utf8.tryDropPrefix(delim.opening.utf8)
+    guard var slice = slice.tryDropPrefix(kind.opening.utf8)
     else { return nil }
 
     // The closing delimiter may optionally match, as it may not be present in
     // invalid code.
+    let delim = Delimiter(kind, poundCount: poundCount)
     if let newSlice = slice.tryDropSuffix(delim.closing.utf8) {
       slice = newSlice
     }
-    return String(slice)
+    let result = String(decoding: slice, as: UTF8.self)
+    precondition(result.utf8.elementsEqual(slice))
+    return (result, delim)
   }
-  for d in Delimiter.allCases {
-    if let contents = stripDelimiter(d) {
+  for kind in Delimiter.Kind.allCases {
+    if let (contents, d) = stripDelimiter(kind) {
       return (contents, d)
     }
   }
diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift
index c50191d05..d11be6c34 100644
--- a/Tests/RegexTests/LexTests.swift
+++ b/Tests/RegexTests/LexTests.swift
@@ -101,26 +101,31 @@ extension RegexTests {
 
 
   func testCompilerInterface() {
+    func delim(_ kind: Delimiter.Kind, poundCount: Int = 0) -> Delimiter {
+      Delimiter(kind, poundCount: poundCount)
+    }
     let testCases: [(String, (String, Delimiter)?)] = [
-      ("#/abc/#", ("abc", .traditional)),
-      ("#|abc|#", ("abc", .experimental)),
+      ("/abc/", ("abc", delim(.forwardSlash))),
+      ("#/abc/#", ("abc", delim(.forwardSlash, poundCount: 1))),
+      ("###/abc/###", ("abc", delim(.forwardSlash, poundCount: 3))),
+      ("#|abc|#", ("abc", delim(.experimental))),
 
       // TODO: Null characters are lexically valid, similar to string literals,
       // but we ought to warn the user about them.
-      ("#|ab\0c|#", ("ab\0c", .experimental)),
+      ("#|ab\0c|#", ("ab\0c", delim(.experimental))),
       ("'abc'", nil),
-      ("#/abc/def/#", ("abc/def", .traditional)),
-      ("#|abc|def|#", ("abc|def", .experimental)),
-      ("#/abc\\/#def/#", ("abc\\/#def", .traditional)),
-      ("#|abc\\|#def|#", ("abc\\|#def", .experimental)),
-      ("#/abc|#def/#", ("abc|#def", .traditional)),
-      ("#|abc/#def|#", ("abc/#def", .experimental)),
+      ("#/abc/def/#", ("abc/def", delim(.forwardSlash, poundCount: 1))),
+      ("#|abc|def|#", ("abc|def", delim(.experimental))),
+      ("#/abc\\/#def/#", ("abc\\/#def", delim(.forwardSlash, poundCount: 1))),
+      ("#|abc\\|#def|#", ("abc\\|#def", delim(.experimental))),
+      ("#/abc|#def/#", ("abc|#def", delim(.forwardSlash, poundCount: 1))),
+      ("#|abc/#def|#", ("abc/#def", delim(.experimental))),
       ("#/abc|#def/", nil),
       ("#|abc/#def#", nil),
       ("#/abc\n/#", nil),
       ("#/abc\r/#", nil),
 
-      (#"re'abcre\''"#, (#"abcre\'"#, .reSingleQuote)),
+      (#"re'abcre\''"#, (#"abcre\'"#, delim(.reSingleQuote))),
       (#"re'\'"#, nil)
     ]
 
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index b185234a0..c4f13ffd9 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -1736,7 +1736,9 @@ extension RegexTests {
 
     // MARK: Parse with delimiters
 
+    parseWithDelimitersTest("/a b/", concat("a", " ", "b"))
     parseWithDelimitersTest("#/a b/#", concat("a", " ", "b"))
+    parseWithDelimitersTest("##/a b/##", concat("a", " ", "b"))
     parseWithDelimitersTest("#|a b|#", concat("a", "b"))
 
     parseWithDelimitersTest("re'a b'", concat("a", " ", "b"))
@@ -1773,6 +1775,11 @@ extension RegexTests {
     // Printable ASCII characters.
     delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##)
 
+    // Make sure we can handle a combining accent as first character.
+    parseWithDelimitersTest("/\u{301}/", "\u{301}")
+
+    delimiterLexingTest("/a/#", ignoreTrailing: true)
+
     // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter
     // if it's clear that it's part of the regex syntax.
 
@@ -2302,6 +2309,11 @@ extension RegexTests {
     delimiterLexingDiagnosticTest("re'(?('abc'", .unterminated)
     delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .unterminated)
     delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .unterminated)
+
+    // MARK: Unbalanced extended syntax
+    delimiterLexingDiagnosticTest("#/a/", .unterminated)
+    delimiterLexingDiagnosticTest("##/a/#", .unterminated)
+
   }
 
   func testlibswiftDiagnostics() {

From 9f42ea4ce07194030e63ec104438a0bf4d9e12bd Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Fri, 1 Apr 2022 14:34:53 +0100
Subject: [PATCH 6/7] Introduce a multi-line literal mode

When an extended delimiter `#/` is followed by a
newline, enter a multi-line mode where the literal
may span multiple lines, and extended syntax is
enabled by default.
---
 .../Regex/Parse/DelimiterLexing.swift         |  67 +++++++--
 .../Regex/Parse/Diagnostics.swift             |   4 +
 .../Regex/Parse/LexicalAnalysis.swift         |   8 +-
 .../_RegexParser/Regex/Parse/Mocking.swift    |   4 +-
 Sources/_RegexParser/Regex/Parse/Parse.swift  |  46 +++++--
 .../Regex/Parse/SyntaxOptions.swift           |   5 +
 Tests/RegexTests/LexTests.swift               |   5 +
 Tests/RegexTests/ParseTests.swift             | 127 ++++++++++++++++++
 8 files changed, 239 insertions(+), 27 deletions(-)

diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
index fa6ca978a..a9f92ade3 100644
--- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
+++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
@@ -9,8 +9,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// TODO: mock up multi-line soon
-
 struct Delimiter: Hashable {
   let kind: Kind
   let poundCount: Int
@@ -28,13 +26,13 @@ struct Delimiter: Hashable {
     kind.closing + String(repeating: "#", count: poundCount)
   }
 
-  /// The default set of syntax options that the delimiter indicates.
-  var defaultSyntaxOptions: SyntaxOptions {
+  /// Whether or not multi-line mode is permitted.
+  var allowsMultiline: Bool {
     switch kind {
-    case .forwardSlash, .reSingleQuote:
-      return .traditional
-    case .experimental, .rxSingleQuote:
-      return .experimental
+    case .forwardSlash:
+      return poundCount > 0
+    case .experimental, .reSingleQuote, .rxSingleQuote:
+      return false
     }
   }
 }
@@ -76,6 +74,7 @@ struct DelimiterLexError: Error, CustomStringConvertible {
     case invalidUTF8 // TODO: better range reporting
     case unknownDelimiter
     case unprintableASCII
+    case multilineClosingNotOnNewline
   }
 
   var kind: Kind
@@ -94,6 +93,7 @@ struct DelimiterLexError: Error, CustomStringConvertible {
     case .invalidUTF8: return "invalid UTF-8 found in source file"
     case .unknownDelimiter: return "unknown regex literal delimiter"
     case .unprintableASCII: return "unprintable ASCII character found in source file"
+    case .multilineClosingNotOnNewline: return "closing delimiter must appear on new line"
     }
   }
 }
@@ -103,6 +103,9 @@ fileprivate struct DelimiterLexer {
   var cursor: UnsafeRawPointer
   let end: UnsafeRawPointer
 
+  var firstNewline: UnsafeRawPointer?
+  var isMultiline: Bool { firstNewline != nil }
+
   init(start: UnsafeRawPointer, end: UnsafeRawPointer) {
     precondition(start <= end)
     self.start = start
@@ -262,12 +265,23 @@ fileprivate struct DelimiterLexer {
     let contentsEnd = cursor
     guard tryEat(delimiter.closing.utf8) else { return nil }
 
-    // Form a string from the contents and make sure it's valid UTF-8.
     let count = contentsEnd - contentsStart
     let contents = UnsafeRawBufferPointer(
       start: contentsStart, count: count)
-    let s = String(decoding: contents, as: UTF8.self)
 
+    // In multi-line mode, the closing delimiter must be on a new line. So scan
+    // backwards and make sure we only have whitespace until the newline.
+    if isMultiline {
+      let idx = contents.lastIndex(
+        where: { $0 == ascii("\n") || $0 == ascii("\r") })! + 1
+      guard contents[idx...].all({ $0 == ascii(" ") || $0 == ascii("\t") })
+      else {
+        throw DelimiterLexError(.multilineClosingNotOnNewline, resumeAt: cursor)
+      }
+    }
+
+    // Form a string from the contents and make sure it's valid UTF-8.
+    let s = String(decoding: contents, as: UTF8.self)
     guard s.utf8.elementsEqual(contents) else {
       throw DelimiterLexError(.invalidUTF8, resumeAt: cursor)
     }
@@ -278,7 +292,10 @@ fileprivate struct DelimiterLexer {
   /// the end of the buffer is reached.
   mutating func advance(escaped: Bool = false) throws {
     guard let next = load() else {
-      throw DelimiterLexError(.unterminated, resumeAt: cursor)
+      // We've hit the end of the buffer. In multi-line mode, we don't want to
+      // skip over what is likely otherwise valid Swift code, so resume from the
+      // first newline.
+      throw DelimiterLexError(.unterminated, resumeAt: firstNewline ?? cursor)
     }
     switch UnicodeScalar(next) {
     case let next where !next.isASCII:
@@ -289,7 +306,10 @@ fileprivate struct DelimiterLexer {
       advanceCursor()
 
     case "\n", "\r":
-      throw DelimiterLexError(.unterminated, resumeAt: cursor)
+      guard isMultiline else {
+        throw DelimiterLexError(.unterminated, resumeAt: cursor)
+      }
+      advanceCursor()
 
     case "\0":
       // TODO: Warn to match the behavior of String literal lexer? Or should
@@ -301,8 +321,12 @@ fileprivate struct DelimiterLexer {
       advanceCursor()
       try advance(escaped: true)
 
-    case let next where !next.isPrintableASCII:
+    case let next
+      where !next.isPrintableASCII && !(isMultiline && next == "\t"):
       // Diagnose unprintable ASCII.
+      // Note that tabs are allowed in multi-line literals.
+      // TODO: This matches the string literal behavior, but should we allow
+      // tabs for single-line regex literals too?
       // TODO: Ideally we would recover and continue to lex until the ending
       // delimiter.
       throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor())
@@ -349,6 +373,23 @@ fileprivate struct DelimiterLexer {
       throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor())
     }
     let contentsStart = cursor
+
+    // If the delimiter allows multi-line, try skipping over any whitespace to a
+    // newline character. If we can do that, we enter multi-line mode.
+    if delimiter.allowsMultiline {
+      while let next = load() {
+        switch next {
+        case ascii(" "), ascii("\t"):
+          advanceCursor()
+          continue
+        case ascii("\n"), ascii("\r"):
+          firstNewline = cursor
+        default:
+          break
+        }
+        break
+      }
+    }
     while true {
       // Check to see if we're at a character that looks like a delimiter, but
       // likely isn't. In such a case, we can attempt to skip over it.
diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift
index d4c809045..621d6ea11 100644
--- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift
+++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift
@@ -70,6 +70,8 @@ enum ParseError: Error, Hashable {
 
   case cannotRemoveTextSegmentOptions
   case cannotRemoveSemanticsOptions
+  case cannotRemoveExtendedSyntaxInMultilineMode
+
   case expectedCalloutArgument
 }
 
@@ -158,6 +160,8 @@ extension ParseError: CustomStringConvertible {
       return "text segment mode cannot be unset, only changed"
     case .cannotRemoveSemanticsOptions:
       return "semantic level cannot be unset, only changed"
+    case .cannotRemoveExtendedSyntaxInMultilineMode:
+      return "extended syntax may not be disabled in multi-line mode"
     case .expectedCalloutArgument:
       return "expected argument to callout"
     }
diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
index 165e97d1a..c48d53de9 100644
--- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -657,6 +657,7 @@ extension Source {
   ///                        | MatchingOption* '-' MatchingOption*
   ///
   mutating func lexMatchingOptionSequence(
+    context: ParsingContext
   ) throws -> AST.MatchingOptionSequence? {
     let ateCaret = recordLoc { $0.tryEat("^") }
 
@@ -691,6 +692,11 @@ extension Source {
         if opt.isSemanticMatchingLevel {
           throw ParseError.cannotRemoveSemanticsOptions
         }
+        // Extended syntax may not be removed if in multi-line mode.
+        if context.syntax.contains(.multilineExtendedSyntax) &&
+            opt.isAnyExtended {
+          throw ParseError.cannotRemoveExtendedSyntaxInMultilineMode
+        }
         removing.append(opt)
       }
       return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location,
@@ -864,7 +870,7 @@ extension Source {
           }
 
           // Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:).
-          if let seq = try src.lexMatchingOptionSequence() {
+          if let seq = try src.lexMatchingOptionSequence(context: context) {
             if src.tryEat(":") {
               return .changeMatchingOptions(seq, isIsolated: false)
             }
diff --git a/Sources/_RegexParser/Regex/Parse/Mocking.swift b/Sources/_RegexParser/Regex/Parse/Mocking.swift
index 596a59bf4..dd02e0fc7 100644
--- a/Sources/_RegexParser/Regex/Parse/Mocking.swift
+++ b/Sources/_RegexParser/Regex/Parse/Mocking.swift
@@ -62,8 +62,8 @@ func libswiftLexRegexLiteral(
     curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self)
 
     switch error.kind {
-    case .unterminated:
-      // Missing closing delimiter can be recovered from.
+    case .unterminated, .multilineClosingNotOnNewline:
+      // These can be recovered from.
       return false
     case .unprintableASCII, .invalidUTF8:
       // We don't currently have good recovery behavior for these.
diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift
index 2512f9bf2..c3aa3500b 100644
--- a/Sources/_RegexParser/Regex/Parse/Parse.swift
+++ b/Sources/_RegexParser/Regex/Parse/Parse.swift
@@ -288,22 +288,25 @@ extension Parser {
   ) throws -> AST.Group {
     context.recordGroup(kind.value)
 
-    // Check if we're introducing or removing extended syntax.
+    // Check if we're introducing or removing extended syntax. We skip this for
+    // multi-line, as extended syntax is always enabled there.
     // TODO: PCRE differentiates between (?x) and (?xx) where only the latter
     // handles non-semantic whitespace in a custom character class. Other
     // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore,
     // treat (?x) and (?xx) as the same option here. If we ever get a strict
     // PCRE mode, we will need to change this to handle that.
     let currentSyntax = context.syntax
-    if case .changeMatchingOptions(let c, isIsolated: _) = kind.value {
-      if c.resetsCurrentOptions {
-        context.syntax.remove(.extendedSyntax)
-      }
-      if c.adding.contains(where: \.isAnyExtended) {
-        context.syntax.insert(.extendedSyntax)
-      }
-      if c.removing.contains(where: \.isAnyExtended) {
-        context.syntax.remove(.extendedSyntax)
+    if !context.syntax.contains(.multilineExtendedSyntax) {
+      if case .changeMatchingOptions(let c, isIsolated: _) = kind.value {
+        if c.resetsCurrentOptions {
+          context.syntax.remove(.extendedSyntax)
+        }
+        if c.adding.contains(where: \.isAnyExtended) {
+          context.syntax.insert(.extendedSyntax)
+        }
+        if c.removing.contains(where: \.isAnyExtended) {
+          context.syntax.remove(.extendedSyntax)
+        }
       }
     }
     defer {
@@ -532,11 +535,32 @@ public func parse<S: StringProtocol>(
   return try parser.parse()
 }
 
+/// Retrieve the default set of syntax options that a delimiter and literal
+/// contents indicate.
+fileprivate func defaultSyntaxOptions(
+  _ delim: Delimiter, contents: String
+) -> SyntaxOptions {
+  switch delim.kind {
+  case .forwardSlash:
+    // For an extended syntax forward slash e.g. #/.../#, extended syntax is
+    // enabled by default if the literal spans multiple lines.
+    if delim.poundCount > 0 &&
+        contents.unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) {
+      return .multilineExtendedSyntax
+    }
+    return .traditional
+  case .reSingleQuote:
+    return .traditional
+  case .experimental, .rxSingleQuote:
+    return .experimental
+  }
+}
+
 /// Parse a given regex string with delimiters, inferring the syntax options
 /// from the delimiter used.
 public func parseWithDelimiters<S: StringProtocol>(
   _ regex: S
 ) throws -> AST where S.SubSequence == Substring {
   let (contents, delim) = droppingRegexDelimiters(String(regex))
-  return try parse(contents, delim.defaultSyntaxOptions)
+  return try parse(contents, defaultSyntaxOptions(delim, contents: contents))
 }
diff --git a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift
index 5135d8ec1..b7c09ea1c 100644
--- a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift
+++ b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift
@@ -58,6 +58,11 @@ public struct SyntaxOptions: OptionSet {
   ///  `(_: .*)` == `(?:.*)`
   public static var experimentalCaptures: Self { Self(1 << 5) }
 
+  /// The default syntax for a multi-line regex literal.
+  public static var multilineExtendedSyntax: Self {
+    return [Self(1 << 6), .extendedSyntax]
+  }
+
   /*
 
     /// `<digit>*` == `[[:digit:]]*` == `\d*`
diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift
index d11be6c34..5c304fe58 100644
--- a/Tests/RegexTests/LexTests.swift
+++ b/Tests/RegexTests/LexTests.swift
@@ -110,6 +110,11 @@ extension RegexTests {
       ("###/abc/###", ("abc", delim(.forwardSlash, poundCount: 3))),
       ("#|abc|#", ("abc", delim(.experimental))),
 
+      // Multiline
+      ("#/\na\nb\n/#", ("\na\nb\n", delim(.forwardSlash, poundCount: 1))),
+      ("#/ \na\nb\n  /#", (" \na\nb\n  ", delim(.forwardSlash, poundCount: 1))),
+      ("##/ \na\nb\n  /##", (" \na\nb\n  ", delim(.forwardSlash, poundCount: 2))),
+
       // TODO: Null characters are lexically valid, similar to string literals,
       // but we ought to warn the user about them.
       ("#|ab\0c|#", ("ab\0c", delim(.experimental))),
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index c4f13ffd9..c40cb86ca 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -223,6 +223,36 @@ func diagnosticTest(
   }
 }
 
+func diagnosticWithDelimitersTest(
+  _ input: String, _ expected: ParseError, ignoreTrailing: Bool = false,
+  file: StaticString = #file, line: UInt = #line
+) {
+  // First try lexing.
+  let literal = delimiterLexingTest(
+    input, ignoreTrailing: ignoreTrailing, file: file, line: line)
+
+  do {
+    let orig = try parseWithDelimiters(literal)
+    let ast = orig.root
+    XCTFail("""
+
+      Passed \(ast)
+      But expected error: \(expected)
+    """, file: file, line: line)
+  } catch let e as Source.LocatedError<ParseError> {
+    guard e.error == expected else {
+      XCTFail("""
+
+        Expected: \(expected)
+        Actual: \(e.error)
+      """, file: file, line: line)
+      return
+    }
+  } catch let e {
+    XCTFail("Error without source location: \(e)", file: file, line: line)
+  }
+}
+
 func delimiterLexingDiagnosticTest(
   _ input: String, _ expected: DelimiterLexError.Kind,
   syntax: SyntaxOptions = .traditional,
@@ -1403,6 +1433,18 @@ extension RegexTests {
     parseTest("(?xx) \\ ", changeMatchingOptions(matchingOptions(
       adding: .extraExtended), isIsolated: true, concat(" ")))
 
+    parseTest(
+      "(?x) a (?^) b",
+      changeMatchingOptions(
+        matchingOptions(adding: .extended), isIsolated: true,
+        concat(
+          "a",
+          changeMatchingOptions(
+            unsetMatchingOptions(), isIsolated: true, concat(" ", "b"))
+        )
+      )
+    )
+
     // End of line comments aren't applicable in custom char classes.
     // TODO: ICU supports this.
     parseTest(
@@ -1780,6 +1822,56 @@ extension RegexTests {
 
     delimiterLexingTest("/a/#", ignoreTrailing: true)
 
+    // MARK: Multiline
+
+    parseWithDelimitersTest("#/\n/#", empty())
+    parseWithDelimitersTest("#/\r/#", empty())
+    parseWithDelimitersTest("#/\r\n/#", empty())
+    parseWithDelimitersTest("#/\n\t\t  /#", empty())
+    parseWithDelimitersTest("#/  \t\t\n\t\t  /#", empty())
+
+    parseWithDelimitersTest("#/\n a \n/#", "a")
+    parseWithDelimitersTest("#/\r a \r/#", "a")
+    parseWithDelimitersTest("#/\r\n a \r\n/#", "a")
+    parseWithDelimitersTest("#/\n a \n\t\t  /#", "a")
+    parseWithDelimitersTest("#/\t  \n a \n\t\t  /#", "a")
+
+    parseWithDelimitersTest("""
+      #/
+      a
+        b
+           c
+         /#
+      """, concat("a", "b", "c"))
+
+    parseWithDelimitersTest("""
+      #/
+      a    # comment
+        b # another
+      #
+         /#
+      """, concat("a", "b"))
+
+    // Make sure (?^) is ignored.
+    parseWithDelimitersTest("""
+      #/
+      (?^)
+      # comment
+      /#
+      """, changeMatchingOptions(
+        unsetMatchingOptions(), isIsolated: true, empty())
+    )
+
+    // (?x) has no effect.
+    parseWithDelimitersTest("""
+      #/
+      (?x)
+      # comment
+      /#
+      """, changeMatchingOptions(
+        matchingOptions(adding: .extended), isIsolated: true, empty())
+    )
+
     // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter
     // if it's clear that it's part of the regex syntax.
 
@@ -2162,6 +2254,32 @@ extension RegexTests {
     diagnosticTest("(?-u)", .cannotRemoveSemanticsOptions)
     diagnosticTest("(?-b)", .cannotRemoveSemanticsOptions)
 
+    // Extended syntax may not be removed in multi-line mode.
+    diagnosticWithDelimitersTest("""
+      #/
+      (?-x)a b
+      /#
+      """, .cannotRemoveExtendedSyntaxInMultilineMode
+    )
+    diagnosticWithDelimitersTest("""
+      #/
+      (?-xx)a b
+      /#
+      """, .cannotRemoveExtendedSyntaxInMultilineMode
+    )
+    diagnosticWithDelimitersTest("""
+      #/
+      (?-x:a b)
+      /#
+      """, .cannotRemoveExtendedSyntaxInMultilineMode
+    )
+    diagnosticWithDelimitersTest("""
+      #/
+      (?-xx:a b)
+      /#
+      """, .cannotRemoveExtendedSyntaxInMultilineMode
+    )
+
     // MARK: Group specifiers
 
     diagnosticTest(#"(*"#, .unknownGroupKind("*"))
@@ -2314,6 +2432,15 @@ extension RegexTests {
     delimiterLexingDiagnosticTest("#/a/", .unterminated)
     delimiterLexingDiagnosticTest("##/a/#", .unterminated)
 
+    // MARK: Multiline
+
+    // Can only be done if pound signs are used.
+    delimiterLexingDiagnosticTest("/\n/", .unterminated)
+
+    // Opening and closing delimiters must be on a newline.
+    delimiterLexingDiagnosticTest("#/a\n/#", .unterminated)
+    delimiterLexingDiagnosticTest("#/\na/#", .multilineClosingNotOnNewline)
+    delimiterLexingDiagnosticTest("#/\n#/#", .multilineClosingNotOnNewline)
   }
 
   func testlibswiftDiagnostics() {

From 556bca0abb2bd1623664481f9aa31be0ed19af1f Mon Sep 17 00:00:00 2001
From: Hamish Knight <hamish_github@mediocremail.com>
Date: Fri, 1 Apr 2022 14:34:53 +0100
Subject: [PATCH 7/7] Disable unused delimiters

Leave only `/.../` (and its extended syntax)
enabled for now.
---
 .../Regex/Parse/DelimiterLexing.swift         | 21 ++++++++++++----
 Tests/RegexTests/LexTests.swift               | 24 +++++++++++++++++--
 Tests/RegexTests/ParseTests.swift             |  6 +++--
 3 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
index a9f92ade3..bee782043 100644
--- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
+++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
@@ -35,6 +35,12 @@ struct Delimiter: Hashable {
       return false
     }
   }
+
+  /// The delimiters which are currently enabled.
+  static var enabledDelimiters: [Kind] { [.forwardSlash] }
+
+  /// All known delimiters.
+  static var allDelimiters: [Kind] { Kind.allCases }
 }
 
 extension Delimiter {
@@ -106,11 +112,15 @@ fileprivate struct DelimiterLexer {
   var firstNewline: UnsafeRawPointer?
   var isMultiline: Bool { firstNewline != nil }
 
-  init(start: UnsafeRawPointer, end: UnsafeRawPointer) {
+  let delimiters: [Delimiter.Kind]
+
+  init(start: UnsafeRawPointer, end: UnsafeRawPointer,
+       delimiters: [Delimiter.Kind]) {
     precondition(start <= end)
     self.start = start
     self.cursor = start
     self.end = end
+    self.delimiters = delimiters
   }
 
   func ascii(_ s: Unicode.Scalar) -> UInt8 {
@@ -337,7 +347,7 @@ fileprivate struct DelimiterLexer {
   }
 
   mutating func tryLexOpeningDelimiter(poundCount: Int) -> Delimiter? {
-    for kind in Delimiter.Kind.allCases {
+    for kind in delimiters {
       // If the delimiter allows extended pound syntax, or there are no pounds,
       // we just need to lex it.
       let opening = kind.opening.utf8
@@ -435,7 +445,7 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
     precondition(result.utf8.elementsEqual(slice))
     return (result, delim)
   }
-  for kind in Delimiter.Kind.allCases {
+  for kind in Delimiter.allDelimiters {
     if let (contents, d) = stripDelimiter(kind) {
       return (contents, d)
     }
@@ -446,8 +456,9 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
 /// Attempt to lex a regex literal between `start` and `end`, returning either
 /// the contents and pointer from which to resume lexing, or an error.
 func lexRegex(
-  start: UnsafeRawPointer, end: UnsafeRawPointer
+  start: UnsafeRawPointer, end: UnsafeRawPointer,
+  delimiters: [Delimiter.Kind] = Delimiter.enabledDelimiters
 ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
-  var lexer = DelimiterLexer(start: start, end: end)
+  var lexer = DelimiterLexer(start: start, end: end, delimiters: delimiters)
   return try lexer.lex()
 }
diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift
index 5c304fe58..958c53c26 100644
--- a/Tests/RegexTests/LexTests.swift
+++ b/Tests/RegexTests/LexTests.swift
@@ -100,7 +100,7 @@ extension RegexTests {
   }
 
 
-  func testCompilerInterface() {
+  func testCompilerInterface() throws {
     func delim(_ kind: Delimiter.Kind, poundCount: Int = 0) -> Delimiter {
       Delimiter(kind, poundCount: poundCount)
     }
@@ -138,7 +138,9 @@ extension RegexTests {
       input.withCString {
         let endPtr = $0 + input.utf8.count
         assert(endPtr.pointee == 0)
-        guard let out = try? lexRegex(start: $0, end: endPtr) else {
+        guard let out = try? lexRegex(
+          start: $0, end: endPtr, delimiters: Delimiter.allDelimiters)
+        else {
           XCTAssertNil(expected)
           return
         }
@@ -150,5 +152,23 @@ extension RegexTests {
         XCTAssertEqual(expected?.1, droppedDelimiters.1)
       }
     }
+
+    // TODO: Remove the lexing code for these if we no longer need them.
+    let disabledDelimiters: [String] = [
+      "#|x|#", "re'x'", "rx'y'"
+    ]
+
+    for input in disabledDelimiters {
+      try input.withCString {
+        let endPtr = $0 + input.utf8.count
+        assert(endPtr.pointee == 0)
+        do {
+          _ = try lexRegex(start: $0, end: endPtr)
+          XCTFail()
+        } catch let e as DelimiterLexError {
+          XCTAssertEqual(e.kind, .unknownDelimiter)
+        }
+      }
+    }
   }
 }
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
index c40cb86ca..c6ff3e46d 100644
--- a/Tests/RegexTests/ParseTests.swift
+++ b/Tests/RegexTests/ParseTests.swift
@@ -117,7 +117,8 @@ func delimiterLexingTest(
 ) -> String {
   input.withCString(encodedAs: UTF8.self) { ptr in
     let endPtr = ptr + input.utf8.count
-    let (contents, delim, end) = try! lexRegex(start: ptr, end: endPtr)
+    let (contents, delim, end) = try! lexRegex(
+      start: ptr, end: endPtr, delimiters: Delimiter.allDelimiters)
     if ignoreTrailing {
       XCTAssertNotEqual(end, endPtr, file: file, line: line)
     } else {
@@ -260,7 +261,8 @@ func delimiterLexingDiagnosticTest(
 ) {
   do {
     _ = try input.withCString { ptr in
-      try lexRegex(start: ptr, end: ptr + input.count)
+      try lexRegex(
+        start: ptr, end: ptr + input.count, delimiters: Delimiter.allDelimiters)
     }
     XCTFail("""
       Passed, but expected error: \(expected)