From d3bd6ad9544e3bcfd7f84ad8c2afe16517bf604d Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 21 Mar 2022 19:30:32 +0000 Subject: [PATCH 01/17] Error on unknown escape sequences Throw an error for unknown a-z escape sequences as well as non-ASCII non-whitespace escape sequences. --- .../Regex/Parse/Diagnostics.swift | 3 ++ .../Regex/Parse/LexicalAnalysis.swift | 11 ++++- Tests/RegexTests/ParseTests.swift | 42 ++++++++++++++----- 3 files changed, 45 insertions(+), 11 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index b9c99d9d3..d4c809045 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -39,6 +39,7 @@ enum ParseError: Error, Hashable { case expectedNonEmptyContents case expectedEscape + case invalidEscape(Character) case cannotReferToWholePattern @@ -107,6 +108,8 @@ extension ParseError: CustomStringConvertible { return "expected non-empty contents" case .expectedEscape: return "expected escape sequence" + case .invalidEscape(let c): + return "invalid escape sequence '\\\(c)'" case .cannotReferToWholePattern: return "cannot refer to whole pattern here" case .notQuantifiable: diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index cfab75312..4eb0ebea4 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -1489,8 +1489,17 @@ extension Source { return try .scalar( src.expectUnicodeScalar(escapedCharacter: char).value) default: - return .char(char) + break } + + // We only allow unknown escape sequences for non-letter ASCII, and + // non-ASCII whitespace. 
+ guard (char.isASCII && !char.isLetter) || + (!char.isASCII && char.isWhitespace) + else { + throw ParseError.invalidEscape(char) + } + return .char(char) } } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 6e511767a..69a3a785b 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -544,9 +544,8 @@ extension RegexTests { #"a\Q \Q \\.\Eb"#, concat("a", quote(#" \Q \\."#), "b")) - // These follow the PCRE behavior. + // This follows the PCRE behavior. parseTest(#"\Q\\E"#, quote("\\")) - parseTest(#"\E"#, "E") parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"), syntax: .experimental) @@ -566,6 +565,16 @@ extension RegexTests { parseTest(#"["-"]"#, charClass(range_m("\"", "\""))) + // MARK: Escapes + + // Not metachars, but we allow their escape as ASCII. + parseTest(#"\<"#, "<") + parseTest(#"\ "#, " ") + parseTest(#"\\"#, "\\") + + // Escaped U+3000 IDEOGRAPHIC SPACE. + parseTest(#"\\#u{3000}"#, "\u{3000}") + // MARK: Comments parseTest( @@ -989,13 +998,6 @@ extension RegexTests { // Backreferences are not valid in custom character classes. parseTest(#"[\8]"#, charClass("8")) parseTest(#"[\9]"#, charClass("9")) - parseTest(#"[\g]"#, charClass("g")) - parseTest(#"[\g+30]"#, charClass("g", "+", "3", "0")) - parseTest(#"[\g{1}]"#, charClass("g", "{", "1", "}")) - parseTest(#"[\k'a']"#, charClass("k", "'", "a", "'")) - - parseTest(#"\g"#, atom(.char("g"))) - parseTest(#"\k"#, atom(.char("k"))) // MARK: Character names. @@ -1526,7 +1528,7 @@ extension RegexTests { parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x")) parseWithDelimitersTest(#"re'šŸ”„šŸ‡©šŸ‡°'"#, concat("šŸ”„", "šŸ‡©šŸ‡°")) - parseWithDelimitersTest(#"re'\šŸ”„āœ…'"#, concat("šŸ”„", "āœ…")) + parseWithDelimitersTest(#"re'šŸ”„āœ…'"#, concat("šŸ”„", "āœ…")) // Printable ASCII characters. 
delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##) @@ -1875,6 +1877,26 @@ extension RegexTests { diagnosticTest("\\", .expectedEscape) + // TODO: Custom diagnostic for expected backref + diagnosticTest(#"\g"#, .invalidEscape("g")) + diagnosticTest(#"\k"#, .invalidEscape("k")) + + // TODO: Custom diagnostic for backref in custom char class + diagnosticTest(#"[\g]"#, .invalidEscape("g")) + diagnosticTest(#"[\g+30]"#, .invalidEscape("g")) + diagnosticTest(#"[\g{1}]"#, .invalidEscape("g")) + diagnosticTest(#"[\k'a']"#, .invalidEscape("k")) + + // TODO: Custom diagnostic for missing '\Q' + diagnosticTest(#"\E"#, .invalidEscape("E")) + + // Non-ASCII non-whitespace cases. + diagnosticTest(#"\šŸ”„"#, .invalidEscape("šŸ”„")) + diagnosticTest(#"\šŸ‡©šŸ‡°"#, .invalidEscape("šŸ‡©šŸ‡°")) + diagnosticTest(#"\e\#u{301}"#, .invalidEscape("e\u{301}")) + diagnosticTest(#"\\#u{E9}"#, .invalidEscape("Ć©")) + diagnosticTest(#"\Ė‚"#, .invalidEscape("Ė‚")) + // MARK: Text Segment options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) From 5a52d531097d3e67daf2c7af3a863d66aaf6388f Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 21 Mar 2022 19:30:32 +0000 Subject: [PATCH 02/17] Allow certain escape sequences in character class ranges Certain escape sequences express a unicode scalar and as such are valid in a range. 
--- Sources/_RegexParser/Regex/AST/Atom.swift | 60 ++++++++++++++++++-- Sources/_RegexParser/Regex/Parse/Parse.swift | 5 +- Tests/RegexTests/MatchTests.swift | 29 ++++++++++ Tests/RegexTests/ParseTests.swift | 31 ++++++++++ 4 files changed, 118 insertions(+), 7 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index bc346469b..0aa0951c5 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -641,17 +641,67 @@ extension AST.Atom { case .scalar(let s): return Character(s) + case .escaped(let c): + switch c { + // TODO: Should we separate these into a separate enum? Or move the + // specifics of the scalar to the DSL tree? + case .alarm: + return "\u{7}" + case .backspace: + return "\u{8}" + case .escape: + return "\u{1B}" + case .formfeed: + return "\u{C}" + case .newline: + return "\n" + case .carriageReturn: + return "\r" + case .tab: + return "\t" + + case .singleDataUnit, .decimalDigit, .notDecimalDigit, + .horizontalWhitespace, .notHorizontalWhitespace, .notNewline, + .newlineSequence, .whitespace, .notWhitespace, .verticalTab, + .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster, + .wordBoundary, .notWordBoundary, .startOfSubject, + .endOfSubjectBeforeNewline, .endOfSubject, + .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar, + .textSegment, .notTextSegment: + return nil + } + case .keyboardControl, .keyboardMeta, .keyboardMetaControl: - // TODO: Not a character per-say, what should we do? - fallthrough + // TODO: These should have unicode scalar values. + return nil - case .property, .escaped, .any, .startOfLine, .endOfLine, - .backreference, .subpattern, .namedCharacter, .callout, - .backtrackingDirective: + case .namedCharacter: + // TODO: This should have a unicode scalar value depending on the name + // given. + // TODO: Do we want to validate and assign a scalar value when building + // the AST? 
Or defer for the matching engine? + return nil + + case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern, + .callout, .backtrackingDirective: return nil } } + /// Whether this atom is valid as the operand of a custom character class + /// range. + public var isValidCharacterClassRangeBound: Bool { + // If we have a literal character value for this, it can be used as a bound. + if literalCharacterValue != nil { return true } + switch kind { + // \cx, \C-x, \M-x, \M-\C-x, \N{...} + case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter: + return true + default: + return false + } + } + /// Produce a string literal representation of the atom, if possible /// /// Individual characters will be returned, Unicode scalars will be diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 296956fdc..4481cf602 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -489,10 +489,11 @@ extension Parser { // Range between atoms. if let (dashLoc, rhs) = try source.lexCustomCharClassRangeEnd(context: context) { - guard atom.literalCharacterValue != nil && - rhs.literalCharacterValue != nil else { + guard atom.isValidCharacterClassRangeBound && + rhs.isValidCharacterClassRangeBound else { throw ParseError.invalidCharacterClassRangeOperand } + // TODO: Validate lower <= upper? members.append(.range(.init(atom, dashLoc, rhs))) continue } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 52db17aa7..67412d262 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -594,6 +594,35 @@ extension RegexTests { firstMatchTest("[[:script=Greek:]]", input: "123αβγxyz", match: "α") + func scalar(_ u: UnicodeScalar) -> UInt32 { u.value } + + // Currently not supported in the matching engine. + for s in scalar("\u{C}") ... scalar("\u{1B}") { + let u = UnicodeScalar(s)! 
+ firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)", + xfail: true) + } + for u: UnicodeScalar in ["\u{7}", "\u{8}"] { + firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)", + xfail: true) + } + for s in scalar("\u{A}") ... scalar("\u{D}") { + let u = UnicodeScalar(s)! + firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)", + xfail: true) + } + firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}", + xfail: true) + + for c: UnicodeScalar in ["a", "b", "c"] { + firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)", + xfail: true) + } + for c: UnicodeScalar in ["$", "%", "&", "'"] { + firstMatchTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, + input: "#()\(c)", match: "\(c)", xfail: true) + } + // MARK: Operators firstMatchTest( diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 69a3a785b..76327ac64 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -494,6 +494,25 @@ extension RegexTests { parseTest("[*]", charClass("*")) parseTest("[{0}]", charClass("{", "0", "}")) + parseTest(#"[\f-\e]"#, charClass( + range_m(.escaped(.formfeed), .escaped(.escape)))) + parseTest(#"[\a-\b]"#, charClass( + range_m(.escaped(.alarm), .escaped(.backspace)))) + parseTest(#"[\n-\r]"#, charClass( + range_m(.escaped(.newline), .escaped(.carriageReturn)))) + parseTest(#"[\t-\t]"#, charClass( + range_m(.escaped(.tab), .escaped(.tab)))) + + parseTest(#"[\cX-\cY\C-A-\C-B\M-\C-A-\M-\C-B\M-A-\M-B]"#, charClass( + range_m(.keyboardControl("X"), .keyboardControl("Y")), + range_m(.keyboardControl("A"), .keyboardControl("B")), + range_m(.keyboardMetaControl("A"), .keyboardMetaControl("B")), + range_m(.keyboardMeta("A"), .keyboardMeta("B")) + )) + + parseTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass( + range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE")))) + // MARK: Operators parseTest( @@ -575,6 +594,15 @@ extension RegexTests { // 
Escaped U+3000 IDEOGRAPHIC SPACE. parseTest(#"\\#u{3000}"#, "\u{3000}") + // Control and meta controls. + parseTest(#"\c "#, atom(.keyboardControl(" "))) + parseTest(#"\c!"#, atom(.keyboardControl("!"))) + parseTest(#"\c~"#, atom(.keyboardControl("~"))) + parseTest(#"\C--"#, atom(.keyboardControl("-"))) + parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a"))) + parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-"))) + parseTest(#"\M-a"#, atom(.keyboardMeta("a"))) + // MARK: Comments parseTest( @@ -1877,6 +1905,9 @@ extension RegexTests { diagnosticTest("\\", .expectedEscape) + // TODO: Custom diagnostic for control sequence + diagnosticTest(#"\c"#, .unexpectedEndOfInput) + // TODO: Custom diagnostic for expected backref diagnosticTest(#"\g"#, .invalidEscape("g")) diagnosticTest(#"\k"#, .invalidEscape("k")) From 692f0fd15bbced7f347ed8b99021d9ad45148369 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 21 Mar 2022 19:30:33 +0000 Subject: [PATCH 03/17] Remove obsolete CharacterClass model computation This is now done from the DSLTree. --- .../_StringProcessing/CharacterClass.swift | 75 ------------------- 1 file changed, 75 deletions(-) diff --git a/Sources/_StringProcessing/CharacterClass.swift b/Sources/_StringProcessing/CharacterClass.swift index 0b95e08b4..d44fa9fb2 100644 --- a/Sources/_StringProcessing/CharacterClass.swift +++ b/Sources/_StringProcessing/CharacterClass.swift @@ -319,21 +319,6 @@ extension CharacterClass { } } -extension AST.Node { - /// If this has a character class representation, whether built-in or custom, return it. - /// - /// TODO: Not sure if this the right model type, but I suspect we'll want to produce - /// something like this on demand - var characterClass: CharacterClass? { - switch self { - case let .customCharacterClass(cc): return cc.modelCharacterClass - case let .atom(a): return a.characterClass - - default: return nil - } - } -} - extension DSLTree.Node { var characterClass: CharacterClass? 
{ switch self { @@ -502,66 +487,6 @@ extension DSLTree.CustomCharacterClass { } } -extension AST.CustomCharacterClass { - /// The model character class for this custom character class. - var modelCharacterClass: CharacterClass? { - typealias Component = CharacterClass.CharacterSetComponent - func getComponents(_ members: [Member]) -> [Component]? { - var result = Array() - for m in members { - switch m { - case .custom(let cc): - guard let cc = cc.modelCharacterClass else { - return nil - } - result.append(.characterClass(cc)) - case .range(let r): - result.append(.range( - r.lhs.literalCharacterValue! ... - r.rhs.literalCharacterValue!)) - - case .atom(let a): - if let cc = a.characterClass { - result.append(.characterClass(cc)) - } else if let lit = a.literalCharacterValue { - result.append(.character(lit)) - } else { - return nil - } - - case .quote(let q): - // Decompose quoted literal into literal characters. - result += q.literal.map { .character($0) } - - case .trivia: - // Not semantically important. - break - - case .setOperation(let lhs, let op, let rhs): - // FIXME: CharacterClass wasn't designed for set operations with - // multiple components in each operand, we should fix that. For now, - // just produce custom components. - guard let lhs = getComponents(lhs), - let rhs = getComponents(rhs) - else { - return nil - } - result.append(.setOperation(.init( - lhs: .characterClass(.custom(lhs)), - op: op.value, - rhs: .characterClass(.custom(rhs))))) - } - } - return result - } - guard let comps = getComponents(members) else { - return nil - } - let cc = CharacterClass.custom(comps) - return self.isInverted ? 
cc.inverted : cc - } -} - extension CharacterClass { // FIXME: Calling on inverted sets wont be the same as the // inverse of a boundary if at the start or end of the From cdf98c5f94bf159450015cc72e675a0930b9dd36 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 21 Mar 2022 19:30:33 +0000 Subject: [PATCH 04/17] Forbid empty character classes As per PCRE, Oniguruma, and ICU, a first character of `]` is treated as literal. --- Sources/_RegexParser/Regex/Parse/Parse.swift | 6 ++++++ Tests/RegexTests/ParseTests.swift | 8 ++++++++ 2 files changed, 14 insertions(+) diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 4481cf602..7867073e6 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -425,6 +425,12 @@ extension Parser { try source.expectNonEmpty() var members: Array = [] + + // We can eat an initial ']', as PCRE, Oniguruma, and ICU forbid empty + // character classes, and assume an initial ']' is literal. + if let loc = source.tryEatWithLoc("]") { + members.append(.atom(.init(.char("]"), loc))) + } try parseCCCMembers(into: &members) // If we have a binary set operator, parse it and the next members. Note diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 76327ac64..f6f31c075 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -428,6 +428,10 @@ extension RegexTests { parseTest("[-]", charClass("-")) + // Empty character classes are forbidden, therefore this is a character + // class of literal ']'. + parseTest("[]]", charClass("]")) + // These are metacharacters in certain contexts, but normal characters // otherwise. parseTest( @@ -1901,6 +1905,10 @@ extension RegexTests { diagnosticTest("(?")) diagnosticTest("(?", .expected(")")) + // The first ']' of a custom character class is literal, so this is missing + // the closing bracket. 
+ diagnosticTest("[]", .expected("]")) + // MARK: Bad escapes diagnosticTest("\\", .expectedEscape) From c5ec8be4088b438f893a623fa0266b10ffaf0450 Mon Sep 17 00:00:00 2001 From: Evan Wilde Date: Wed, 30 Mar 2022 22:16:35 -0700 Subject: [PATCH 05/17] Remove extra const from gestScriptExtensions Returning a constant pointer is extraneous and leads to a bunch of warnings. Since you don't control where the pointer is assigned you can't really control whether the pointer is const or not. The uint8_t inside can be const though. --- Sources/_CUnicode/UnicodeScalarProps.c | 4 ++-- Sources/_CUnicode/include/UnicodeData.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Sources/_CUnicode/UnicodeScalarProps.c b/Sources/_CUnicode/UnicodeScalarProps.c index 9e5a8890a..81ead1421 100644 --- a/Sources/_CUnicode/UnicodeScalarProps.c +++ b/Sources/_CUnicode/UnicodeScalarProps.c @@ -68,8 +68,8 @@ uint8_t _swift_stdlib_getScript(uint32_t scalar) { } SWIFT_CC -const uint8_t * const _swift_stdlib_getScriptExtensions(uint32_t scalar, - uint8_t *count) { +const uint8_t *_swift_stdlib_getScriptExtensions(uint32_t scalar, + uint8_t *count) { intptr_t dataIdx = _swift_stdlib_getScalarBitArrayIdx(scalar, _swift_stdlib_script_extensions, _swift_stdlib_script_extensions_ranks); diff --git a/Sources/_CUnicode/include/UnicodeData.h b/Sources/_CUnicode/include/UnicodeData.h index c9437868c..3ce6e3591 100644 --- a/Sources/_CUnicode/include/UnicodeData.h +++ b/Sources/_CUnicode/include/UnicodeData.h @@ -66,6 +66,7 @@ SWIFT_CC uint8_t _swift_stdlib_getScript(uint32_t scalar); SWIFT_CC -const uint8_t * const _swift_stdlib_getScriptExtensions(uint32_t scalar, uint8_t *count); +const uint8_t *_swift_stdlib_getScriptExtensions(uint32_t scalar, + uint8_t *count); #endif // SWIFT_STDLIB_SHIMS_UNICODEDATA_H From 0108e22cfc4ec70c2db4ac555f5e92b446b97e87 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 31 Mar 2022 10:57:38 -0600 Subject: [PATCH 06/17] DSL support for atomic groups 
(#238) --- Sources/RegexBuilder/DSL.swift | 12 ++ Sources/RegexBuilder/Variadics.swift | 167 ++++++++++++++++++ .../VariadicsGenerator.swift | 61 +++++++ 3 files changed, 240 insertions(+) diff --git a/Sources/RegexBuilder/DSL.swift b/Sources/RegexBuilder/DSL.swift index 457439a43..632f1baba 100644 --- a/Sources/RegexBuilder/DSL.swift +++ b/Sources/RegexBuilder/DSL.swift @@ -235,6 +235,18 @@ public struct TryCapture: _BuiltinRegexComponent { // Note: Public initializers are currently gyb'd. See Variadics.swift. } +// MARK: - Groups + +/// An atomic group, i.e. opens a local backtracking scope which, upon successful exit, +/// discards any remaining backtracking points from within the scope +public struct BacktrackingScope: _BuiltinRegexComponent { + public var regex: Regex + + internal init(_ regex: Regex) { + self.regex = regex + } +} + // MARK: - Backreference public struct Reference: RegexComponent { diff --git a/Sources/RegexBuilder/Variadics.swift b/Sources/RegexBuilder/Variadics.swift index f59b1f13a..002898dfd 100644 --- a/Sources/RegexBuilder/Variadics.swift +++ b/Sources/RegexBuilder/Variadics.swift @@ -1566,6 +1566,173 @@ extension Repeat { self.init(node: .repeating(expression.relative(to: 0..( + _ component: Component + ) where Output == Substring { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + @_disfavoredOverload + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == Substring { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0), Component.Output == (W, C0) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0), Component.Output 
== (W, C0) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0, C1), Component.Output == (W, C0, C1) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0, C1), Component.Output == (W, C0, C1) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0, C1, C2), Component.Output == (W, C0, C1, C2) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0, C1, C2), Component.Output == (W, C0, C1, C2) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0, C1, C2, C3), Component.Output == (W, C0, C1, C2, C3) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0, C1, C2, C3), Component.Output == (W, C0, C1, C2, C3) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0, C1, C2, C3, C4), Component.Output == (W, C0, C1, C2, C3, C4) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: 
() -> Component + ) where Output == (Substring, C0, C1, C2, C3, C4), Component.Output == (W, C0, C1, C2, C3, C4) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5), Component.Output == (W, C0, C1, C2, C3, C4, C5) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5), Component.Output == (W, C0, C1, C2, C3, C4, C5) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ 
component: Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} extension AlternationBuilder { public static func buildPartialBlock( accumulated: R0, next: R1 diff --git a/Sources/VariadicsGenerator/VariadicsGenerator.swift b/Sources/VariadicsGenerator/VariadicsGenerator.swift index ff406e9fb..23a362dad 100644 --- a/Sources/VariadicsGenerator/VariadicsGenerator.swift +++ b/Sources/VariadicsGenerator/VariadicsGenerator.swift @@ -155,6 +155,13 @@ struct VariadicsGenerator: ParsableCommand { print(to: &standardError) } + print("Generating atomic groups...", to: &standardError) + for arity in 0...maxArity { + print(" Arity \(arity): ", terminator: "", to: &standardError) + emitAtomicGroup(arity: arity) + print(to: &standardError) + } + print("Generating alternation overloads...", to: &standardError) for (leftArity, rightArity) in Permutations(totalArity: 
maxArity) { print( @@ -393,6 +400,60 @@ struct VariadicsGenerator: ParsableCommand { """) } + + + func emitAtomicGroup(arity: Int) { + assert(arity >= 0) + let groupName = "BacktrackingScope" + func node(builder: Bool) -> String { + """ + .nonCapturingGroup(.atomicNonCapturing, component\( + builder ? "()" : "" + ).regex.root) + """ + } + + let disfavored = arity == 0 ? "@_disfavoredOverload\n" : "" + let genericParams: String = { + var result = "" + if arity > 0 { + result += "W" + result += (0..( + _ component: Component + ) \(whereClauseForInit) { + self.init(node: \(node(builder: false))) + } + } + + extension \(groupName) { + \(disfavored)\ + public init<\(genericParams)>( + @\(concatBuilderName) _ component: () -> Component + ) \(whereClauseForInit) { + self.init(node: \(node(builder: true))) + } + } + + """) + } + func emitRepeating(arity: Int) { assert(arity >= 0) From 692237f84af785ccb7156b68b2ae6a8a18fe4909 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 31 Mar 2022 14:40:55 -0600 Subject: [PATCH 07/17] Rename BacktrackingScope to Local (#239) --- Sources/RegexBuilder/DSL.swift | 2 +- Sources/RegexBuilder/Variadics.swift | 44 +++++++++---------- .../VariadicsGenerator.swift | 2 +- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/Sources/RegexBuilder/DSL.swift b/Sources/RegexBuilder/DSL.swift index 632f1baba..80662be41 100644 --- a/Sources/RegexBuilder/DSL.swift +++ b/Sources/RegexBuilder/DSL.swift @@ -239,7 +239,7 @@ public struct TryCapture: _BuiltinRegexComponent { /// An atomic group, i.e. 
opens a local backtracking scope which, upon successful exit, /// discards any remaining backtracking points from within the scope -public struct BacktrackingScope: _BuiltinRegexComponent { +public struct Local: _BuiltinRegexComponent { public var regex: Regex internal init(_ regex: Regex) { diff --git a/Sources/RegexBuilder/Variadics.swift b/Sources/RegexBuilder/Variadics.swift index 002898dfd..989e5d463 100644 --- a/Sources/RegexBuilder/Variadics.swift +++ b/Sources/RegexBuilder/Variadics.swift @@ -1566,7 +1566,7 @@ extension Repeat { self.init(node: .repeating(expression.relative(to: 0..( _ component: Component @@ -1575,7 +1575,7 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { @_disfavoredOverload public init( @RegexComponentBuilder _ component: () -> Component @@ -1583,7 +1583,7 @@ extension BacktrackingScope { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0), Component.Output == (W, C0) { @@ -1591,14 +1591,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0), Component.Output == (W, C0) { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0, C1), Component.Output == (W, C0, C1) { @@ -1606,14 +1606,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1), Component.Output == (W, C0, C1) { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: 
Component ) where Output == (Substring, C0, C1, C2), Component.Output == (W, C0, C1, C2) { @@ -1621,14 +1621,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1, C2), Component.Output == (W, C0, C1, C2) { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0, C1, C2, C3), Component.Output == (W, C0, C1, C2, C3) { @@ -1636,14 +1636,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1, C2, C3), Component.Output == (W, C0, C1, C2, C3) { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0, C1, C2, C3, C4), Component.Output == (W, C0, C1, C2, C3, C4) { @@ -1651,14 +1651,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1, C2, C3, C4), Component.Output == (W, C0, C1, C2, C3, C4) { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5), Component.Output == (W, C0, C1, C2, C3, C4, C5) { @@ -1666,14 +1666,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5), Component.Output == (W, C0, C1, C2, C3, C4, C5) { self.init(node: 
.nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6) { @@ -1681,14 +1681,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6) { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7) { @@ -1696,14 +1696,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7) { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8) { @@ -1711,14 +1711,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8) { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9), 
Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { @@ -1726,7 +1726,7 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { diff --git a/Sources/VariadicsGenerator/VariadicsGenerator.swift b/Sources/VariadicsGenerator/VariadicsGenerator.swift index 23a362dad..dbeff818c 100644 --- a/Sources/VariadicsGenerator/VariadicsGenerator.swift +++ b/Sources/VariadicsGenerator/VariadicsGenerator.swift @@ -404,7 +404,7 @@ struct VariadicsGenerator: ParsableCommand { func emitAtomicGroup(arity: Int) { assert(arity >= 0) - let groupName = "BacktrackingScope" + let groupName = "Local" func node(builder: Bool) -> String { """ .nonCapturingGroup(.atomicNonCapturing, component\( From 096d39d4051af8e918c9d8d77554487525ac48d7 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:51 +0100 Subject: [PATCH 08/17] Better filter trivia in dumps Make sure we don't try and print things like empty comma lists `,,,,` or redundant parens for concatenations that had their trivia filtered out. --- .../_RegexParser/Regex/Printing/DumpAST.swift | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 47142407a..0e40ad2ce 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -44,18 +44,23 @@ extension _ASTPrintable { guard let children = _children else { return _dumpBase } - let sub = children.lazy.compactMap { + let childDump = children.compactMap { child -> String? in // Exclude trivia for now, as we don't want it to appear when performing // comparisons of dumped output in tests. 
// TODO: We should eventually have some way of filtering out trivia for // tests, so that it can appear in regular dumps. - if $0.isTrivia { return nil } - return $0._dump() - }.joined(separator: ",") - if sub.isEmpty { - return "\(_dumpBase)" + if child.isTrivia { return nil } + let dump = child._dump() + return !dump.isEmpty ? dump : nil } - return "\(_dumpBase)(\(sub))" + let base = "\(_dumpBase)" + if childDump.isEmpty { + return base + } + if childDump.count == 1, base.isEmpty { + return "\(childDump[0])" + } + return "\(base)(\(childDump.joined(separator: ",")))" } } @@ -77,7 +82,7 @@ extension AST.Node: _ASTPrintable { } extension AST.Alternation { - public var _dumpBase: String { "alternation" } + public var _dumpBase: String { "alternation<\(children.count)>" } } extension AST.Concatenation { From c6dc547908bd3aab852e04c47286a651b31d8c00 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:51 +0100 Subject: [PATCH 09/17] Formalize non-semantic whitespace matching Turns out this is a Unicode-defined thing. --- .../Regex/Parse/LexicalAnalysis.swift | 26 ++++--------------- .../_RegexParser/Utility/MissingUnicode.swift | 6 +++++ 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 4eb0ebea4..b595f3d29 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -550,28 +550,12 @@ extension Source { ) throws -> AST.Trivia? { guard context.ignoreWhitespace else { return nil } - func isWhitespace(_ c: Character) -> Bool { - // This is a list of characters that PCRE treats as whitespace when - // compiled with Unicode support. It is a subset of the characters with - // the `.isWhitespace` property. ICU appears to also follow this list. - // Oniguruma and .NET follow a subset of this list. 
- // - // FIXME: PCRE only treats space and tab characters as whitespace when - // inside a custom character class (and only treats whitespace as - // non-semantic there for the extra-extended `(?xx)` mode). If we get a - // strict-PCRE mode, we'll need to add a case for that. - switch c { - case " ", "\u{9}"..."\u{D}", // space, \t, \n, vertical tab, \f, \r - "\u{85}", "\u{200E}", // next line, left-to-right mark - "\u{200F}", "\u{2028}", // right-to-left-mark, line separator - "\u{2029}": // paragraph separator - return true - default: - return false - } - } + // FIXME: PCRE only treats space and tab characters as whitespace when + // inside a custom character class (and only treats whitespace as + // non-semantic there for the extra-extended `(?xx)` mode). If we get a + // strict-PCRE mode, we'll need to add a case for that. let trivia: Located? = recordLoc { src in - src.tryEatPrefix(isWhitespace)?.string + src.tryEatPrefix(\.isPatternWhitespace)?.string } guard let trivia = trivia else { return nil } return AST.Trivia(trivia) diff --git a/Sources/_RegexParser/Utility/MissingUnicode.swift b/Sources/_RegexParser/Utility/MissingUnicode.swift index dccba3286..4d819806b 100644 --- a/Sources/_RegexParser/Utility/MissingUnicode.swift +++ b/Sources/_RegexParser/Utility/MissingUnicode.swift @@ -660,6 +660,12 @@ extension Character { public var isOctalDigit: Bool { ("0"..."7").contains(self) } public var isWordCharacter: Bool { isLetter || isNumber || self == "_" } + + /// Whether this character represents whitespace for the purposes of pattern + /// parsing. 
+ public var isPatternWhitespace: Bool { + return unicodeScalars.first!.properties.isPatternWhitespace + } } extension UnicodeScalar { From a96648badd28106b4db723aca44b1f83fa956ffe Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:52 +0100 Subject: [PATCH 10/17] Rename endOfString -> unterminated --- .../Regex/Parse/DelimiterLexing.swift | 8 ++++---- Sources/_RegexParser/Regex/Parse/Mocking.swift | 2 +- Tests/RegexTests/ParseTests.swift | 16 ++++++++-------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift index 1227ade1f..e88c1fa80 100644 --- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift @@ -41,7 +41,7 @@ enum Delimiter: Hashable, CaseIterable { struct DelimiterLexError: Error, CustomStringConvertible { enum Kind: Hashable { - case endOfString + case unterminated case invalidUTF8 // TODO: better range reporting case unknownDelimiter case unprintableASCII @@ -59,7 +59,7 @@ struct DelimiterLexError: Error, CustomStringConvertible { var description: String { switch kind { - case .endOfString: return "unterminated regex literal" + case .unterminated: return "unterminated regex literal" case .invalidUTF8: return "invalid UTF-8 found in source file" case .unknownDelimiter: return "unknown regex literal delimiter" case .unprintableASCII: return "unprintable ASCII character found in source file" @@ -238,7 +238,7 @@ fileprivate struct DelimiterLexer { /// the end of the buffer is reached. 
mutating func advance(escaped: Bool = false) throws { guard let next = load() else { - throw DelimiterLexError(.endOfString, resumeAt: cursor) + throw DelimiterLexError(.unterminated, resumeAt: cursor) } switch UnicodeScalar(next) { case let next where !next.isASCII: @@ -249,7 +249,7 @@ fileprivate struct DelimiterLexer { advanceCursor() case "\n", "\r": - throw DelimiterLexError(.endOfString, resumeAt: cursor) + throw DelimiterLexError(.unterminated, resumeAt: cursor) case "\0": // TODO: Warn to match the behavior of String literal lexer? Or should diff --git a/Sources/_RegexParser/Regex/Parse/Mocking.swift b/Sources/_RegexParser/Regex/Parse/Mocking.swift index 5994a4f52..596a59bf4 100644 --- a/Sources/_RegexParser/Regex/Parse/Mocking.swift +++ b/Sources/_RegexParser/Regex/Parse/Mocking.swift @@ -62,7 +62,7 @@ func libswiftLexRegexLiteral( curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self) switch error.kind { - case .endOfString: + case .unterminated: // Missing closing delimiter can be recovered from. return false case .unprintableASCII, .invalidUTF8: diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index f6f31c075..649ea22e2 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2079,21 +2079,21 @@ extension RegexTests { // MARK: Printable ASCII - delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString) + delimiterLexingDiagnosticTest(#"re'\\#n'"#, .unterminated) for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r. 
delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII) } - delimiterLexingDiagnosticTest("re'\n'", .endOfString) - delimiterLexingDiagnosticTest("re'\r'", .endOfString) + delimiterLexingDiagnosticTest("re'\n'", .unterminated) + delimiterLexingDiagnosticTest("re'\r'", .unterminated) delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII) // MARK: Delimiter skipping - delimiterLexingDiagnosticTest("re'(?''", .endOfString) - delimiterLexingDiagnosticTest("re'(?'abc'", .endOfString) - delimiterLexingDiagnosticTest("re'(?('abc'", .endOfString) - delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .endOfString) - delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .endOfString) + delimiterLexingDiagnosticTest("re'(?''", .unterminated) + delimiterLexingDiagnosticTest("re'(?'abc'", .unterminated) + delimiterLexingDiagnosticTest("re'(?('abc'", .unterminated) + delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .unterminated) + delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .unterminated) } func testlibswiftDiagnostics() { From 120ffc90de110ed3e2d1af382cb2f0f093e340da Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:52 +0100 Subject: [PATCH 11/17] Fix end-of-line-comment lexing Previously we would just lex to the end of the input, as it was assumed only single-line regex would be supported. Update the implementation to handle multi-line, and take account of PCRE global options. 
--- .../Regex/AST/MatchingOptions.swift | 3 +- .../Regex/Parse/LexicalAnalysis.swift | 26 ++- Sources/_RegexParser/Regex/Parse/Parse.swift | 13 ++ Sources/_RegexParser/Regex/Parse/Source.swift | 6 + Tests/RegexTests/ParseTests.swift | 208 ++++++++++++++++++ 5 files changed, 250 insertions(+), 6 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift index 25cb10842..808b51287 100644 --- a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift +++ b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift @@ -137,7 +137,8 @@ extension AST { /// Global matching option specifiers. Unlike `MatchingOptionSequence`, /// these must appear at the start of the pattern, and apply globally. public struct GlobalMatchingOption: _ASTNode, Hashable { - /// Determines the definition of a newline for the '.' character class. + /// Determines the definition of a newline for the '.' character class and + /// when parsing end-of-line comments. public enum NewlineMatching: Hashable { /// (*CR*) case carriageReturnOnly diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index b595f3d29..165e97d1a 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -528,11 +528,27 @@ extension Source { return try src.expectQuoted(endingWith: "*/").value } if context.endOfLineComments, src.tryEat("#") { - // TODO: If we ever support multi-line regex literals, this will need - // to be updated to stop at a newline. Note though that PCRE specifies - // that the newline it matches against can be controlled by the global - // matching options e.g `(*CR)`, `(*ANY)`, ... - return src.lexUntil(\.isEmpty).value + // Try eat until we either exhaust the input, or hit a newline. Note + // that the definition of newline can be altered depending on the global + // matching options. 
By default we consider a newline to be `\n` or + // `\r`. + return src.lexUntil { src in + if src.isEmpty { return true } + switch context.newlineMode { + case .carriageReturnOnly: + return src.tryEat("\r") + case .linefeedOnly: + return src.tryEat("\n") + case .carriageAndLinefeedOnly: + return src.tryEat("\r\n") + case .anyCarriageReturnOrLinefeed: + return src.tryEat(anyOf: "\r", "\n", "\r\n") != nil + case .anyUnicode: + return src.tryEat(where: \.isNewline) + case .nulCharacter: + return src.tryEat("\0") + } + }.value } return nil } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 7867073e6..2512f9bf2 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -76,6 +76,10 @@ struct ParsingContext { /// The syntax options currently set. fileprivate(set) var syntax: SyntaxOptions + /// The current newline matching mode. + fileprivate(set) var newlineMode: AST.GlobalMatchingOption.NewlineMatching + = .anyCarriageReturnOrLinefeed + fileprivate mutating func recordGroup(_ g: AST.Group.Kind) { // TODO: Needs to track group number resets (?|...). priorGroupCount += 1 @@ -139,6 +143,15 @@ extension Parser { // First parse any global matching options if present. let opts = try source.lexGlobalMatchingOptionSequence() + // If we have a newline mode global option, update the context accordingly. + if let opts = opts { + for opt in opts.options.reversed() { + guard case .newlineMatching(let newline) = opt.kind else { continue } + context.newlineMode = newline + break + } + } + // Then parse the root AST node. 
let ast = try parseNode() guard source.isEmpty else { diff --git a/Sources/_RegexParser/Regex/Parse/Source.swift b/Sources/_RegexParser/Regex/Parse/Source.swift index ddf0475f3..6eac16395 100644 --- a/Sources/_RegexParser/Regex/Parse/Source.swift +++ b/Sources/_RegexParser/Regex/Parse/Source.swift @@ -68,6 +68,12 @@ extension Source { return true } + mutating func tryEat(where pred: (Char) throws -> Bool) rethrows -> Bool { + guard let next = peek(), try pred(next) else { return false } + advance() + return true + } + mutating func tryEat(sequence c: C) -> Bool where C.Element == Char { guard _slice.starts(with: c) else { return false } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 649ea22e2..b185234a0 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1526,6 +1526,214 @@ extension RegexTests { matchingOptions(adding: .extended), isIsolated: true, charClass("a", "b")) ) + // Test multi-line comment handling. 
+ parseTest( + """ + # a + bc # d + ef# g + # h + """, + concat("b", "c", "e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + # a\r\ + bc # d\r\ + ef# g\r\ + # h\r + """, + concat("b", "c", "e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + # a\r\ + bc # d\r\ + ef# g\r\ + # h\r + """, + concat("b", "c", "e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + # a\r + bc # d\r + ef# g\r + # h\r + """, + concat("b", "c", "e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + # a\n\r\ + bc # d\n\r\ + ef# g\n\r\ + # h\n\r + """, + concat("b", "c", "e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + (*CR) + # a + bc # d + ef# g + # h + """, + ast(empty(), opts: .newlineMatching(.carriageReturnOnly)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*CR)\r\ + # a\r\ + bc # d\r\ + ef# g\r\ + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageReturnOnly)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*LF) + # a + bc # d + ef# g + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.linefeedOnly)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*CRLF) + # a + bc # d + ef# g + # h + """, + ast(empty(), opts: .newlineMatching(.carriageAndLinefeedOnly)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*CRLF) + # a\r + bc # d\r + ef# g\r + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageAndLinefeedOnly)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*ANYCRLF) + # a + bc # d + ef# g + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*ANYCRLF) + # a\r\ + bc # d\r\ + ef# g\r\ + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*ANYCRLF) + # a\r + bc # d\r + ef# g\r + # h + """, + ast(concat("b", "c", "e", "f"), opts: 
.newlineMatching(.anyCarriageReturnOrLinefeed)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*ANY) + # a + bc # d + ef# g + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), + syntax: .extendedSyntax + ) + parseTest( + """ + # a\u{2028}\ + bc # d + ef# g\u{2028}\ + # h + """, + concat("e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + (*ANY) + # a\u{2028}\ + bc # d\u{2028}\ + ef# g\u{2028}\ + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*NUL) + # a + bc # d\0\ + ef# g + # h + """, + ast(concat("e", "f"), opts: .newlineMatching(.nulCharacter)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*NUL) + # a\0\ + bc # d\0\ + ef# g\0\ + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.nulCharacter)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*CR)(*NUL) + # a\0\ + bc # d\0\ + ef# g\0\ + # h + """, + ast(concat("b", "c", "e", "f"), + opts: .newlineMatching(.carriageReturnOnly), + .newlineMatching(.nulCharacter) + ), + syntax: .extendedSyntax + ) + // MARK: Parse with delimiters parseWithDelimitersTest("#/a b/#", concat("a", " ", "b")) From 4944fbea80d5abbbcc2bc03cc511868aebae949e Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:52 +0100 Subject: [PATCH 12/17] Lex extended pound delimiters Start lexing `/.../`, and allow any number of pound signs to surround it. 
--- .../Regex/Parse/DelimiterLexing.swift | 152 +++++++++++++----- Tests/RegexTests/LexTests.swift | 25 +-- Tests/RegexTests/ParseTests.swift | 12 ++ 3 files changed, 143 insertions(+), 46 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift index e88c1fa80..fa6ca978a 100644 --- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift @@ -11,27 +11,27 @@ // TODO: mock up multi-line soon -enum Delimiter: Hashable, CaseIterable { - case traditional - case experimental - case reSingleQuote - case rxSingleQuote - - var openingAndClosing: (opening: String, closing: String) { - switch self { - case .traditional: return ("#/", "/#") - case .experimental: return ("#|", "|#") - case .reSingleQuote: return ("re'", "'") - case .rxSingleQuote: return ("rx'", "'") - } +struct Delimiter: Hashable { + let kind: Kind + let poundCount: Int + + init(_ kind: Kind, poundCount: Int) { + precondition(kind.allowsExtendedPoundSyntax || poundCount == 0) + self.kind = kind + self.poundCount = poundCount + } + + var opening: String { + String(repeating: "#", count: poundCount) + kind.opening + } + var closing: String { + kind.closing + String(repeating: "#", count: poundCount) } - var opening: String { openingAndClosing.opening } - var closing: String { openingAndClosing.closing } /// The default set of syntax options that the delimiter indicates. 
var defaultSyntaxOptions: SyntaxOptions { - switch self { - case .traditional, .reSingleQuote: + switch kind { + case .forwardSlash, .reSingleQuote: return .traditional case .experimental, .rxSingleQuote: return .experimental @@ -39,6 +39,37 @@ enum Delimiter: Hashable, CaseIterable { } } +extension Delimiter { + enum Kind: Hashable, CaseIterable { + case forwardSlash + case experimental + case reSingleQuote + case rxSingleQuote + + var openingAndClosing: (opening: String, closing: String) { + switch self { + case .forwardSlash: return ("/", "/") + case .experimental: return ("#|", "|#") + case .reSingleQuote: return ("re'", "'") + case .rxSingleQuote: return ("rx'", "'") + } + } + var opening: String { openingAndClosing.opening } + var closing: String { openingAndClosing.closing } + + /// Whether or not extended pound syntax e.g `##/.../##` is allowed with + /// this delimiter. + var allowsExtendedPoundSyntax: Bool { + switch self { + case .forwardSlash: + return true + case .experimental, .reSingleQuote, .rxSingleQuote: + return false + } + } + } +} + struct DelimiterLexError: Error, CustomStringConvertible { enum Kind: Hashable { case unterminated @@ -120,16 +151,25 @@ fileprivate struct DelimiterLexer { precondition(cursor <= end, "Cannot advance past end") } - /// Check to see if a UTF-8 sequence can be eaten from the current cursor. - func canEat(_ utf8: String.UTF8View) -> Bool { - guard let slice = slice(utf8.count) else { return false } - return slice.elementsEqual(utf8) + /// Check to see if a byte sequence can be eaten from the current cursor. + func canEat(_ bytes: C) -> Bool where C.Element == UInt8 { + guard let slice = slice(bytes.count) else { return false } + return slice.elementsEqual(bytes) + } + + /// Attempt to eat a byte sequence, returning `true` if successful. 
+ mutating func tryEat( + _ bytes: C + ) -> Bool where C.Element == UInt8 { + guard canEat(bytes) else { return false } + advanceCursor(bytes.count) + return true } - /// Attempt to eat a UTF-8 byte sequence, returning `true` if successful. - mutating func tryEat(_ utf8: String.UTF8View) -> Bool { - guard canEat(utf8) else { return false } - advanceCursor(utf8.count) + /// Attempt to eat an ascii scalar, returning `true` if successful. + mutating func tryEat(ascii s: Unicode.Scalar) -> Bool { + guard load() == ascii(s) else { return false } + advanceCursor() return true } @@ -137,8 +177,8 @@ fileprivate struct DelimiterLexer { /// the actual closing delimiter. mutating func trySkipDelimiter(_ delimiter: Delimiter) { // Only the closing `'` for re'...'/rx'...' can potentially be skipped over. - switch delimiter { - case .traditional, .experimental: + switch delimiter.kind { + case .forwardSlash, .experimental: return case .reSingleQuote, .rxSingleQuote: break @@ -272,16 +312,42 @@ fileprivate struct DelimiterLexer { } } + mutating func tryLexOpeningDelimiter(poundCount: Int) -> Delimiter? { + for kind in Delimiter.Kind.allCases { + // If the delimiter allows extended pound syntax, or there are no pounds, + // we just need to lex it. + let opening = kind.opening.utf8 + if kind.allowsExtendedPoundSyntax || poundCount == 0 { + guard tryEat(opening) else { continue } + return Delimiter(kind, poundCount: poundCount) + } + + // The delimiter doesn't allow extended pound syntax, so the pounds must be + // part of the delimiter. + guard + poundCount < opening.count, + opening.prefix(poundCount) + .elementsEqual(repeatElement(ascii("#"), count: poundCount)), + tryEat(opening.dropFirst(poundCount)) + else { continue } + + return Delimiter(kind, poundCount: 0) + } + return nil + } + /*consuming*/ mutating func lex( ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { + // We can consume any number of pound signs. 
+ var poundCount = 0 + while tryEat(ascii: "#") { + poundCount += 1 + } // Try to lex the opening delimiter. - guard let delimiter = Delimiter.allCases.first( - where: { tryEat($0.opening.utf8) } - ) else { + guard let delimiter = tryLexOpeningDelimiter(poundCount: poundCount) else { throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor()) } - let contentsStart = cursor while true { // Check to see if we're at a character that looks like a delimiter, but @@ -302,20 +368,34 @@ fileprivate struct DelimiterLexer { /// Drop a set of regex delimiters from the input string, returning the contents /// and the delimiter used. The input string must have valid delimiters. func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { - func stripDelimiter(_ delim: Delimiter) -> String? { + func stripDelimiter(_ kind: Delimiter.Kind) -> (String, Delimiter)? { + var slice = str.utf8[...] + + // Try strip any number of opening '#'s. + var poundCount = 0 + if kind.allowsExtendedPoundSyntax { + poundCount = slice.prefix(while: { + $0 == UInt8(("#" as UnicodeScalar).value) + }).count + slice = slice.dropFirst(poundCount) + } + // The opening delimiter must match. - guard var slice = str.utf8.tryDropPrefix(delim.opening.utf8) + guard var slice = slice.tryDropPrefix(kind.opening.utf8) else { return nil } // The closing delimiter may optionally match, as it may not be present in // invalid code. 
+ let delim = Delimiter(kind, poundCount: poundCount) if let newSlice = slice.tryDropSuffix(delim.closing.utf8) { slice = newSlice } - return String(slice) + let result = String(decoding: slice, as: UTF8.self) + precondition(result.utf8.elementsEqual(slice)) + return (result, delim) } - for d in Delimiter.allCases { - if let contents = stripDelimiter(d) { + for kind in Delimiter.Kind.allCases { + if let (contents, d) = stripDelimiter(kind) { return (contents, d) } } diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index c50191d05..d11be6c34 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -101,26 +101,31 @@ extension RegexTests { func testCompilerInterface() { + func delim(_ kind: Delimiter.Kind, poundCount: Int = 0) -> Delimiter { + Delimiter(kind, poundCount: poundCount) + } let testCases: [(String, (String, Delimiter)?)] = [ - ("#/abc/#", ("abc", .traditional)), - ("#|abc|#", ("abc", .experimental)), + ("/abc/", ("abc", delim(.forwardSlash))), + ("#/abc/#", ("abc", delim(.forwardSlash, poundCount: 1))), + ("###/abc/###", ("abc", delim(.forwardSlash, poundCount: 3))), + ("#|abc|#", ("abc", delim(.experimental))), // TODO: Null characters are lexically valid, similar to string literals, // but we ought to warn the user about them. 
- ("#|ab\0c|#", ("ab\0c", .experimental)), + ("#|ab\0c|#", ("ab\0c", delim(.experimental))), ("'abc'", nil), - ("#/abc/def/#", ("abc/def", .traditional)), - ("#|abc|def|#", ("abc|def", .experimental)), - ("#/abc\\/#def/#", ("abc\\/#def", .traditional)), - ("#|abc\\|#def|#", ("abc\\|#def", .experimental)), - ("#/abc|#def/#", ("abc|#def", .traditional)), - ("#|abc/#def|#", ("abc/#def", .experimental)), + ("#/abc/def/#", ("abc/def", delim(.forwardSlash, poundCount: 1))), + ("#|abc|def|#", ("abc|def", delim(.experimental))), + ("#/abc\\/#def/#", ("abc\\/#def", delim(.forwardSlash, poundCount: 1))), + ("#|abc\\|#def|#", ("abc\\|#def", delim(.experimental))), + ("#/abc|#def/#", ("abc|#def", delim(.forwardSlash, poundCount: 1))), + ("#|abc/#def|#", ("abc/#def", delim(.experimental))), ("#/abc|#def/", nil), ("#|abc/#def#", nil), ("#/abc\n/#", nil), ("#/abc\r/#", nil), - (#"re'abcre\''"#, (#"abcre\'"#, .reSingleQuote)), + (#"re'abcre\''"#, (#"abcre\'"#, delim(.reSingleQuote))), (#"re'\'"#, nil) ] diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index b185234a0..c4f13ffd9 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1736,7 +1736,9 @@ extension RegexTests { // MARK: Parse with delimiters + parseWithDelimitersTest("/a b/", concat("a", " ", "b")) parseWithDelimitersTest("#/a b/#", concat("a", " ", "b")) + parseWithDelimitersTest("##/a b/##", concat("a", " ", "b")) parseWithDelimitersTest("#|a b|#", concat("a", "b")) parseWithDelimitersTest("re'a b'", concat("a", " ", "b")) @@ -1773,6 +1775,11 @@ extension RegexTests { // Printable ASCII characters. delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##) + // Make sure we can handle a combining accent as first character. 
+ parseWithDelimitersTest("/\u{301}/", "\u{301}") + + delimiterLexingTest("/a/#", ignoreTrailing: true) + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter // if it's clear that it's part of the regex syntax. @@ -2302,6 +2309,11 @@ extension RegexTests { delimiterLexingDiagnosticTest("re'(?('abc'", .unterminated) delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .unterminated) delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .unterminated) + + // MARK: Unbalanced extended syntax + delimiterLexingDiagnosticTest("#/a/", .unterminated) + delimiterLexingDiagnosticTest("##/a/#", .unterminated) + } func testlibswiftDiagnostics() { From 9f42ea4ce07194030e63ec104438a0bf4d9e12bd Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:53 +0100 Subject: [PATCH 13/17] Introduce a multi-line literal mode When an extended delimiter `#/` is followed by a newline, enter a multi-line mode where the literal may span multiple lines, and extended syntax is enabled by default. 
--- .../Regex/Parse/DelimiterLexing.swift | 67 +++++++-- .../Regex/Parse/Diagnostics.swift | 4 + .../Regex/Parse/LexicalAnalysis.swift | 8 +- .../_RegexParser/Regex/Parse/Mocking.swift | 4 +- Sources/_RegexParser/Regex/Parse/Parse.swift | 46 +++++-- .../Regex/Parse/SyntaxOptions.swift | 5 + Tests/RegexTests/LexTests.swift | 5 + Tests/RegexTests/ParseTests.swift | 127 ++++++++++++++++++ 8 files changed, 239 insertions(+), 27 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift index fa6ca978a..a9f92ade3 100644 --- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift @@ -9,8 +9,6 @@ // //===----------------------------------------------------------------------===// -// TODO: mock up multi-line soon - struct Delimiter: Hashable { let kind: Kind let poundCount: Int @@ -28,13 +26,13 @@ struct Delimiter: Hashable { kind.closing + String(repeating: "#", count: poundCount) } - /// The default set of syntax options that the delimiter indicates. - var defaultSyntaxOptions: SyntaxOptions { + /// Whether or not multi-line mode is permitted. 
+ var allowsMultiline: Bool { switch kind { - case .forwardSlash, .reSingleQuote: - return .traditional - case .experimental, .rxSingleQuote: - return .experimental + case .forwardSlash: + return poundCount > 0 + case .experimental, .reSingleQuote, .rxSingleQuote: + return false } } } @@ -76,6 +74,7 @@ struct DelimiterLexError: Error, CustomStringConvertible { case invalidUTF8 // TODO: better range reporting case unknownDelimiter case unprintableASCII + case multilineClosingNotOnNewline } var kind: Kind @@ -94,6 +93,7 @@ struct DelimiterLexError: Error, CustomStringConvertible { case .invalidUTF8: return "invalid UTF-8 found in source file" case .unknownDelimiter: return "unknown regex literal delimiter" case .unprintableASCII: return "unprintable ASCII character found in source file" + case .multilineClosingNotOnNewline: return "closing delimiter must appear on new line" } } } @@ -103,6 +103,9 @@ fileprivate struct DelimiterLexer { var cursor: UnsafeRawPointer let end: UnsafeRawPointer + var firstNewline: UnsafeRawPointer? + var isMultiline: Bool { firstNewline != nil } + init(start: UnsafeRawPointer, end: UnsafeRawPointer) { precondition(start <= end) self.start = start @@ -262,12 +265,23 @@ fileprivate struct DelimiterLexer { let contentsEnd = cursor guard tryEat(delimiter.closing.utf8) else { return nil } - // Form a string from the contents and make sure it's valid UTF-8. let count = contentsEnd - contentsStart let contents = UnsafeRawBufferPointer( start: contentsStart, count: count) - let s = String(decoding: contents, as: UTF8.self) + // In multi-line mode, we must be on a new line. So scan backwards and make + // sure we only have whitespace until the newline. + if isMultiline { + let idx = contents.lastIndex( + where: { $0 == ascii("\n") || $0 == ascii("\r") })! 
+ 1 + guard contents[idx...].all({ $0 == ascii(" ") || $0 == ascii("\t") }) + else { + throw DelimiterLexError(.multilineClosingNotOnNewline, resumeAt: cursor) + } + } + + // Form a string from the contents and make sure it's valid UTF-8. + let s = String(decoding: contents, as: UTF8.self) guard s.utf8.elementsEqual(contents) else { throw DelimiterLexError(.invalidUTF8, resumeAt: cursor) } @@ -278,7 +292,10 @@ fileprivate struct DelimiterLexer { /// the end of the buffer is reached. mutating func advance(escaped: Bool = false) throws { guard let next = load() else { - throw DelimiterLexError(.unterminated, resumeAt: cursor) + // We've hit the end of the buffer. In multi-line mode, we don't want to + // skip over what is likely otherwise valid Swift code, so resume from the + // first newline. + throw DelimiterLexError(.unterminated, resumeAt: firstNewline ?? cursor) } switch UnicodeScalar(next) { case let next where !next.isASCII: @@ -289,7 +306,10 @@ fileprivate struct DelimiterLexer { advanceCursor() case "\n", "\r": - throw DelimiterLexError(.unterminated, resumeAt: cursor) + guard isMultiline else { + throw DelimiterLexError(.unterminated, resumeAt: cursor) + } + advanceCursor() case "\0": // TODO: Warn to match the behavior of String literal lexer? Or should @@ -301,8 +321,12 @@ fileprivate struct DelimiterLexer { advanceCursor() try advance(escaped: true) - case let next where !next.isPrintableASCII: + case let next + where !next.isPrintableASCII && !(isMultiline && next == "\t"): // Diagnose unprintable ASCII. + // Note that tabs are allowed in multi-line literals. + // TODO: This matches the string literal behavior, but should we allow + // tabs for single-line regex literals too? // TODO: Ideally we would recover and continue to lex until the ending // delimiter. 
throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor()) @@ -349,6 +373,23 @@ fileprivate struct DelimiterLexer { throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor()) } let contentsStart = cursor + + // If the delimiter allows multi-line, try skipping over any whitespace to a + // newline character. If we can do that, we enter multi-line mode. + if delimiter.allowsMultiline { + while let next = load() { + switch next { + case ascii(" "), ascii("\t"): + advanceCursor() + continue + case ascii("\n"), ascii("\r"): + firstNewline = cursor + default: + break + } + break + } + } while true { // Check to see if we're at a character that looks like a delimiter, but // likely isn't. In such a case, we can attempt to skip over it. diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index d4c809045..621d6ea11 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -70,6 +70,8 @@ enum ParseError: Error, Hashable { case cannotRemoveTextSegmentOptions case cannotRemoveSemanticsOptions + case cannotRemoveExtendedSyntaxInMultilineMode + case expectedCalloutArgument } @@ -158,6 +160,8 @@ extension ParseError: CustomStringConvertible { return "text segment mode cannot be unset, only changed" case .cannotRemoveSemanticsOptions: return "semantic level cannot be unset, only changed" + case .cannotRemoveExtendedSyntaxInMultilineMode: + return "extended syntax may not be disabled in multi-line mode" case .expectedCalloutArgument: return "expected argument to callout" } diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 165e97d1a..c48d53de9 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -657,6 +657,7 @@ extension Source { /// | MatchingOption* '-' 
MatchingOption* /// mutating func lexMatchingOptionSequence( + context: ParsingContext ) throws -> AST.MatchingOptionSequence? { let ateCaret = recordLoc { $0.tryEat("^") } @@ -691,6 +692,11 @@ extension Source { if opt.isSemanticMatchingLevel { throw ParseError.cannotRemoveSemanticsOptions } + // Extended syntax may not be removed if in multi-line mode. + if context.syntax.contains(.multilineExtendedSyntax) && + opt.isAnyExtended { + throw ParseError.cannotRemoveExtendedSyntaxInMultilineMode + } removing.append(opt) } return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location, @@ -864,7 +870,7 @@ extension Source { } // Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:). - if let seq = try src.lexMatchingOptionSequence() { + if let seq = try src.lexMatchingOptionSequence(context: context) { if src.tryEat(":") { return .changeMatchingOptions(seq, isIsolated: false) } diff --git a/Sources/_RegexParser/Regex/Parse/Mocking.swift b/Sources/_RegexParser/Regex/Parse/Mocking.swift index 596a59bf4..dd02e0fc7 100644 --- a/Sources/_RegexParser/Regex/Parse/Mocking.swift +++ b/Sources/_RegexParser/Regex/Parse/Mocking.swift @@ -62,8 +62,8 @@ func libswiftLexRegexLiteral( curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self) switch error.kind { - case .unterminated: - // Missing closing delimiter can be recovered from. + case .unterminated, .multilineClosingNotOnNewline: + // These can be recovered from. return false case .unprintableASCII, .invalidUTF8: // We don't currently have good recovery behavior for these. diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 2512f9bf2..c3aa3500b 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -288,22 +288,25 @@ extension Parser { ) throws -> AST.Group { context.recordGroup(kind.value) - // Check if we're introducing or removing extended syntax. 
+ // Check if we're introducing or removing extended syntax. We skip this for + // multi-line, as extended syntax is always enabled there. // TODO: PCRE differentiates between (?x) and (?xx) where only the latter // handles non-semantic whitespace in a custom character class. Other // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore, // treat (?x) and (?xx) as the same option here. If we ever get a strict // PCRE mode, we will need to change this to handle that. let currentSyntax = context.syntax - if case .changeMatchingOptions(let c, isIsolated: _) = kind.value { - if c.resetsCurrentOptions { - context.syntax.remove(.extendedSyntax) - } - if c.adding.contains(where: \.isAnyExtended) { - context.syntax.insert(.extendedSyntax) - } - if c.removing.contains(where: \.isAnyExtended) { - context.syntax.remove(.extendedSyntax) + if !context.syntax.contains(.multilineExtendedSyntax) { + if case .changeMatchingOptions(let c, isIsolated: _) = kind.value { + if c.resetsCurrentOptions { + context.syntax.remove(.extendedSyntax) + } + if c.adding.contains(where: \.isAnyExtended) { + context.syntax.insert(.extendedSyntax) + } + if c.removing.contains(where: \.isAnyExtended) { + context.syntax.remove(.extendedSyntax) + } } } defer { @@ -532,11 +535,32 @@ public func parse( return try parser.parse() } +/// Retrieve the default set of syntax options that a delimiter and literal +/// contents indicates. +fileprivate func defaultSyntaxOptions( + _ delim: Delimiter, contents: String +) -> SyntaxOptions { + switch delim.kind { + case .forwardSlash: + // For an extended syntax forward slash e.g #/.../#, extended syntax is + // permitted if it spans multiple lines. 
+ if delim.poundCount > 0 && + contents.unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) { + return .multilineExtendedSyntax + } + return .traditional + case .reSingleQuote: + return .traditional + case .experimental, .rxSingleQuote: + return .experimental + } +} + /// Parse a given regex string with delimiters, inferring the syntax options /// from the delimiter used. public func parseWithDelimiters( _ regex: S ) throws -> AST where S.SubSequence == Substring { let (contents, delim) = droppingRegexDelimiters(String(regex)) - return try parse(contents, delim.defaultSyntaxOptions) + return try parse(contents, defaultSyntaxOptions(delim, contents: contents)) } diff --git a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift index 5135d8ec1..b7c09ea1c 100644 --- a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift +++ b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift @@ -58,6 +58,11 @@ public struct SyntaxOptions: OptionSet { /// `(_: .*)` == `(?:.*)` public static var experimentalCaptures: Self { Self(1 << 5) } + /// The default syntax for a multi-line regex literal. + public static var multilineExtendedSyntax: Self { + return [Self(1 << 6), .extendedSyntax] + } + /* /// `*` == `[[:digit:]]*` == `\d*` diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index d11be6c34..5c304fe58 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -110,6 +110,11 @@ extension RegexTests { ("###/abc/###", ("abc", delim(.forwardSlash, poundCount: 3))), ("#|abc|#", ("abc", delim(.experimental))), + // Multiline + ("#/\na\nb\n/#", ("\na\nb\n", delim(.forwardSlash, poundCount: 1))), + ("#/ \na\nb\n /#", (" \na\nb\n ", delim(.forwardSlash, poundCount: 1))), + ("##/ \na\nb\n /##", (" \na\nb\n ", delim(.forwardSlash, poundCount: 2))), + // TODO: Null characters are lexically valid, similar to string literals, // but we ought to warn the user about them. 
("#|ab\0c|#", ("ab\0c", delim(.experimental))), diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index c4f13ffd9..c40cb86ca 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -223,6 +223,36 @@ func diagnosticTest( } } +func diagnosticWithDelimitersTest( + _ input: String, _ expected: ParseError, ignoreTrailing: Bool = false, + file: StaticString = #file, line: UInt = #line +) { + // First try lexing. + let literal = delimiterLexingTest( + input, ignoreTrailing: ignoreTrailing, file: file, line: line) + + do { + let orig = try parseWithDelimiters(literal) + let ast = orig.root + XCTFail(""" + + Passed \(ast) + But expected error: \(expected) + """, file: file, line: line) + } catch let e as Source.LocatedError { + guard e.error == expected else { + XCTFail(""" + + Expected: \(expected) + Actual: \(e.error) + """, file: file, line: line) + return + } + } catch let e { + XCTFail("Error without source location: \(e)", file: file, line: line) + } +} + func delimiterLexingDiagnosticTest( _ input: String, _ expected: DelimiterLexError.Kind, syntax: SyntaxOptions = .traditional, @@ -1403,6 +1433,18 @@ extension RegexTests { parseTest("(?xx) \\ ", changeMatchingOptions(matchingOptions( adding: .extraExtended), isIsolated: true, concat(" "))) + parseTest( + "(?x) a (?^) b", + changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + concat( + "a", + changeMatchingOptions( + unsetMatchingOptions(), isIsolated: true, concat(" ", "b")) + ) + ) + ) + // End of line comments aren't applicable in custom char classes. // TODO: ICU supports this. 
parseTest( @@ -1780,6 +1822,56 @@ extension RegexTests { delimiterLexingTest("/a/#", ignoreTrailing: true) + // MARK: Multiline + + parseWithDelimitersTest("#/\n/#", empty()) + parseWithDelimitersTest("#/\r/#", empty()) + parseWithDelimitersTest("#/\r\n/#", empty()) + parseWithDelimitersTest("#/\n\t\t /#", empty()) + parseWithDelimitersTest("#/ \t\t\n\t\t /#", empty()) + + parseWithDelimitersTest("#/\n a \n/#", "a") + parseWithDelimitersTest("#/\r a \r/#", "a") + parseWithDelimitersTest("#/\r\n a \r\n/#", "a") + parseWithDelimitersTest("#/\n a \n\t\t /#", "a") + parseWithDelimitersTest("#/\t \n a \n\t\t /#", "a") + + parseWithDelimitersTest(""" + #/ + a + b + c + /# + """, concat("a", "b", "c")) + + parseWithDelimitersTest(""" + #/ + a # comment + b # another + # + /# + """, concat("a", "b")) + + // Make sure (?^) is ignored. + parseWithDelimitersTest(""" + #/ + (?^) + # comment + /# + """, changeMatchingOptions( + unsetMatchingOptions(), isIsolated: true, empty()) + ) + + // (?x) has no effect. + parseWithDelimitersTest(""" + #/ + (?x) + # comment + /# + """, changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, empty()) + ) + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter // if it's clear that it's part of the regex syntax. @@ -2162,6 +2254,32 @@ extension RegexTests { diagnosticTest("(?-u)", .cannotRemoveSemanticsOptions) diagnosticTest("(?-b)", .cannotRemoveSemanticsOptions) + // Extended syntax may not be removed in multi-line mode. 
+ diagnosticWithDelimitersTest(""" + #/ + (?-x)a b + /# + """, .cannotRemoveExtendedSyntaxInMultilineMode + ) + diagnosticWithDelimitersTest(""" + #/ + (?-xx)a b + /# + """, .cannotRemoveExtendedSyntaxInMultilineMode + ) + diagnosticWithDelimitersTest(""" + #/ + (?-x:a b) + /# + """, .cannotRemoveExtendedSyntaxInMultilineMode + ) + diagnosticWithDelimitersTest(""" + #/ + (?-xx:a b) + /# + """, .cannotRemoveExtendedSyntaxInMultilineMode + ) + // MARK: Group specifiers diagnosticTest(#"(*"#, .unknownGroupKind("*")) @@ -2314,6 +2432,15 @@ extension RegexTests { delimiterLexingDiagnosticTest("#/a/", .unterminated) delimiterLexingDiagnosticTest("##/a/#", .unterminated) + // MARK: Multiline + + // Can only be done if pound signs are used. + delimiterLexingDiagnosticTest("/\n/", .unterminated) + + // Opening and closing delimiters must be on a newline. + delimiterLexingDiagnosticTest("#/a\n/#", .unterminated) + delimiterLexingDiagnosticTest("#/\na/#", .multilineClosingNotOnNewline) + delimiterLexingDiagnosticTest("#/\n#/#", .multilineClosingNotOnNewline) } func testlibswiftDiagnostics() { From 556bca0abb2bd1623664481f9aa31be0ed19af1f Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:53 +0100 Subject: [PATCH 14/17] Disable unused delimiters Leave only `/.../` (and its extended syntax) enabled for now. --- .../Regex/Parse/DelimiterLexing.swift | 21 ++++++++++++---- Tests/RegexTests/LexTests.swift | 24 +++++++++++++++++-- Tests/RegexTests/ParseTests.swift | 6 +++-- 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift index a9f92ade3..bee782043 100644 --- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift @@ -35,6 +35,12 @@ struct Delimiter: Hashable { return false } } + + /// The delimiters which are currently enabled. 
+ static var enabledDelimiters: [Kind] { [.forwardSlash] } + + /// All known delimiters. + static var allDelimiters: [Kind] { Kind.allCases } } extension Delimiter { @@ -106,11 +112,15 @@ fileprivate struct DelimiterLexer { var firstNewline: UnsafeRawPointer? var isMultiline: Bool { firstNewline != nil } - init(start: UnsafeRawPointer, end: UnsafeRawPointer) { + let delimiters: [Delimiter.Kind] + + init(start: UnsafeRawPointer, end: UnsafeRawPointer, + delimiters: [Delimiter.Kind]) { precondition(start <= end) self.start = start self.cursor = start self.end = end + self.delimiters = delimiters } func ascii(_ s: Unicode.Scalar) -> UInt8 { @@ -337,7 +347,7 @@ fileprivate struct DelimiterLexer { } mutating func tryLexOpeningDelimiter(poundCount: Int) -> Delimiter? { - for kind in Delimiter.Kind.allCases { + for kind in delimiters { // If the delimiter allows extended pound syntax, or there are no pounds, // we just need to lex it. let opening = kind.opening.utf8 @@ -435,7 +445,7 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { precondition(result.utf8.elementsEqual(slice)) return (result, delim) } - for kind in Delimiter.Kind.allCases { + for kind in Delimiter.allDelimiters { if let (contents, d) = stripDelimiter(kind) { return (contents, d) } @@ -446,8 +456,9 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { /// Attempt to lex a regex literal between `start` and `end`, returning either /// the contents and pointer from which to resume lexing, or an error. 
func lexRegex( - start: UnsafeRawPointer, end: UnsafeRawPointer + start: UnsafeRawPointer, end: UnsafeRawPointer, + delimiters: [Delimiter.Kind] = Delimiter.enabledDelimiters ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { - var lexer = DelimiterLexer(start: start, end: end) + var lexer = DelimiterLexer(start: start, end: end, delimiters: delimiters) return try lexer.lex() } diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index 5c304fe58..958c53c26 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -100,7 +100,7 @@ extension RegexTests { } - func testCompilerInterface() { + func testCompilerInterface() throws { func delim(_ kind: Delimiter.Kind, poundCount: Int = 0) -> Delimiter { Delimiter(kind, poundCount: poundCount) } @@ -138,7 +138,9 @@ extension RegexTests { input.withCString { let endPtr = $0 + input.utf8.count assert(endPtr.pointee == 0) - guard let out = try? lexRegex(start: $0, end: endPtr) else { + guard let out = try? lexRegex( + start: $0, end: endPtr, delimiters: Delimiter.allDelimiters) + else { XCTAssertNil(expected) return } @@ -150,5 +152,23 @@ extension RegexTests { XCTAssertEqual(expected?.1, droppedDelimiters.1) } } + + // TODO: Remove the lexing code for these if we no longer need them. 
+ let disabledDelimiters: [String] = [ + "#|x|#", "re'x'", "rx'y'" + ] + + for input in disabledDelimiters { + try input.withCString { + let endPtr = $0 + input.utf8.count + assert(endPtr.pointee == 0) + do { + _ = try lexRegex(start: $0, end: endPtr) + XCTFail() + } catch let e as DelimiterLexError { + XCTAssertEqual(e.kind, .unknownDelimiter) + } + } + } } } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index c40cb86ca..c6ff3e46d 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -117,7 +117,8 @@ func delimiterLexingTest( ) -> String { input.withCString(encodedAs: UTF8.self) { ptr in let endPtr = ptr + input.utf8.count - let (contents, delim, end) = try! lexRegex(start: ptr, end: endPtr) + let (contents, delim, end) = try! lexRegex( + start: ptr, end: endPtr, delimiters: Delimiter.allDelimiters) if ignoreTrailing { XCTAssertNotEqual(end, endPtr, file: file, line: line) } else { @@ -260,7 +261,8 @@ func delimiterLexingDiagnosticTest( ) { do { _ = try input.withCString { ptr in - try lexRegex(start: ptr, end: ptr + input.count) + try lexRegex( + start: ptr, end: ptr + input.count, delimiters: Delimiter.allDelimiters) } XCTFail(""" Passed, but expected error: \(expected) From 820ab38300b6eff0d52e569612cc35ee8849741d Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Fri, 1 Apr 2022 09:12:52 -0600 Subject: [PATCH 15/17] Regex Type and Overview V2 and accompanying tests/changes (#241) * Clarify contractions * Motivation tests, API updates, and text --- Documentation/Evolution/RegexSyntax.md | 33 ++- Documentation/Evolution/RegexTypeOverview.md | 58 ++-- .../Participants/RegexParticipant.swift | 4 +- Sources/RegexBuilder/Match.swift | 20 +- .../Algorithms/Consumers/RegexConsumer.swift | 2 +- .../Regex/AnyRegexOutput.swift | 13 +- Sources/_StringProcessing/Regex/Core.swift | 76 ++--- Sources/_StringProcessing/Regex/Match.swift | 91 +++++- Tests/RegexBuilderTests/CustomTests.swift | 2 +- 
Tests/RegexBuilderTests/MotivationTests.swift | 267 ++++++++++++++++++ Tests/RegexBuilderTests/RegexDSLTests.swift | 64 ++--- Tests/RegexTests/AlgorithmsTests.swift | 8 +- 12 files changed, 524 insertions(+), 114 deletions(-) create mode 100644 Tests/RegexBuilderTests/MotivationTests.swift diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 5bfdd5e8a..958d52f7d 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -2,27 +2,38 @@ Hello, we want to issue an update to [Regular Expression Literals](https://forums.swift.org/t/pitch-regular-expression-literals/52820) and prepare for a formal proposal. The great delimiter deliberation continues to unfold, so in the meantime, we have a significant amount of surface area to present for review/feedback: the syntax _inside_ a regex literal. Additionally, this is the syntax accepted from a string used for run-time regex construction, so we're devoting an entire pitch/proposal to the topic of _regex syntax_, distinct from the result builder DSL or the choice of delimiters for literals. --> -# Regex Syntax +# Run-time Regex Construction -- Authors: Hamish Knight, Michael Ilseman +- Authors: [Hamish Knight](https://github.com/hamishknight), [Michael Ilseman](https://github.com/milseman) ## Introduction -A regex declares a string processing algorithm using syntax familiar across a variety of languages and tools throughout programming history. Regexes can be created from a string at run time or from a literal at compile time. The contents of that run-time string, or the contents in-between the compile-time literal's delimiters, uses regex syntax. We present a detailed and comprehensive treatment of regex syntax. - -This is part of a larger effort in supporting regex literals, which in turn is part of a larger effort towards better string processing using regex. 
See [Pitch and Proposal Status](https://github.com/apple/swift-experimental-string-processing/issues/107), which tracks each relevant piece. This proposal regards _syntactic_ support, and does not necessarily mean that everything that can be written will be supported by Swift's runtime engine in the initial release. Support for more obscure features may appear over time, see [MatchingEngine Capabilities and Roadmap](https://github.com/apple/swift-experimental-string-processing/issues/99) for status. +A regex declares a string processing algorithm using syntax familiar across a variety of languages and tools throughout programming history. We propose the ability to create a regex at run time from a string containing regex syntax (detailed here), API for accessing the match and captures, and a means to convert between an existential capture representation and concrete types. +The overall story is laid out in [Regex Type and Overview](https://github.com/apple/swift-experimental-string-processing/blob/main/Documentation/Evolution/RegexTypeOverview.md) and each individual component is tracked in [Pitch and Proposal Status](https://github.com/apple/swift-experimental-string-processing/issues/107). ## Motivation Swift aims to be a pragmatic programming language, striking a balance between familiarity, interoperability, and advancing the art. Swift's `String` presents a uniquely Unicode-forward model of string, but currently suffers from limited processing facilities. + + The full string processing effort includes a regex type with strongly typed captures, the ability to create a regex from a string at runtime, a compile-time literal, a result builder DSL, protocols for intermixing 3rd party industrial-strength parsers with regex declarations, and a slew of regex-powered algorithms over strings. This proposal specifically hones in on the _familiarity_ aspect by providing a best-in-class treatment of familiar regex syntax. 
## Proposed Solution + + +### Syntax + We propose accepting a syntactic "superset" of the following existing regular expression engines: - [PCRE 2][pcre2-syntax], an "industry standard" and a rough superset of Perl, Python, etc. @@ -40,6 +51,10 @@ Regex syntax will be part of Swift's source-compatibility story as well as its b ## Detailed Design + + We propose the following syntax for regex.
Grammar Notation @@ -832,6 +847,14 @@ Regex syntax will become part of Swift's source and binary-compatibility story, Even though it is more work up-front and creates a longer proposal, it is less risky to support the full intended syntax. The proposed superset maximizes the familiarity benefit of regex syntax. + + [pcre2-syntax]: https://www.pcre.org/current/doc/html/pcre2syntax.html [oniguruma-syntax]: https://github.com/kkos/oniguruma/blob/master/doc/RE [icu-syntax]: https://unicode-org.github.io/icu/userguide/strings/regexp.html diff --git a/Documentation/Evolution/RegexTypeOverview.md b/Documentation/Evolution/RegexTypeOverview.md index 6fe3bc0bf..504111181 100644 --- a/Documentation/Evolution/RegexTypeOverview.md +++ b/Documentation/Evolution/RegexTypeOverview.md @@ -149,11 +149,11 @@ Type mismatches and invalid regex syntax are diagnosed at construction time by ` When the pattern is known at compile time, regexes can be created from a literal containing the same regex syntax, allowing the compiler to infer the output type. Regex literals enable source tools, e.g. syntax highlighting and actions to refactor into a result builder equivalent. ```swift -let regex = re'(\w+)\s\s+(\S+)\s\s+((?:(?!\s\s).)*)\s\s+(.*)' +let regex = /(\w+)\s\s+(\S+)\s\s+((?:(?!\s\s).)*)\s\s+(.*)/ // regex: Regex<(Substring, Substring, Substring, Substring, Substring)> ``` -*Note*: Regex literals, most notably the choice of delimiter, are discussed in [Regex Literals][pitches]. For this example, I used the less technically-problematic option of `re'...'`. +*Note*: Regex literals, most notably the choice of delimiter, are discussed in [Regex Literals][pitches]. This same regex can be created from a result builder, a refactoring-friendly representation: @@ -193,13 +193,13 @@ A `Regex.Match` contains the result of a match, surfacing captures by nu ```swift func processEntry(_ line: String) -> Transaction? 
{ - let regex = re''' - (?x) # Ignore whitespace and comments + // Multiline literal implies `(?x)`, i.e. non-semantic whitespace with line-ending comments + let regex = #/ (? \w+) \s\s+ (? \S+) \s\s+ (? (?: (?!\s\s) . )+) \s\s+ (? .*) - ''' + /# // regex: Regex<( // Substring, // kind: Substring, @@ -291,7 +291,7 @@ A regex describes an algorithm to be ran over some model of string, and Swift's Calling `dropFirst()` will not drop a leading byte or `Unicode.Scalar`, but rather a full `Character`. Similarly, a `.` in a regex will match any extended grapheme cluster. A regex will match canonical equivalents by default, strengthening the connection between regex and the equivalent `String` operations. -Additionally, word boundaries (`\b`) follow [UTS\#29 Word Boundaries](https://www.unicode.org/reports/tr29/#Word_Boundaries), meaning contractions ("don't") and script changes are detected and separated, without incurring significant binary size costs associated with language dictionaries. +Additionally, word boundaries (`\b`) follow [UTS\#29 Word Boundaries](https://www.unicode.org/reports/tr29/#Word_Boundaries). Contractions ("don't") are correctly detected and script changes are separated, without incurring significant binary size costs associated with language dictionaries. Regex targets [UTS\#18 Level 2](https://www.unicode.org/reports/tr18/#Extended_Unicode_Support) by default, but provides options to switch to scalar-level processing as well as compatibility character classes. Detailed rules on how we infer necessary grapheme cluster breaks inside regexes, as well as options and other concerns, are discussed in [Unicode for String Processing][pitches]. @@ -300,18 +300,47 @@ Regex targets [UTS\#18 Level 2](https://www.unicode.org/reports/tr18/#Extended_U ```swift /// A regex represents a string processing algorithm. 
+/// +/// let regex = try Regex(compiling: "a(.*)b") +/// let match = "cbaxb".firstMatch(of: regex) +/// print(match.0) // "axb" +/// print(match.1) // "x" +/// public struct Regex { /// Match a string in its entirety. /// /// Returns `nil` if no match and throws on abort - public func matchWhole(_: String) throws -> Match? + public func matchWhole(_ s: String) throws -> Regex.Match? - /// Match at the front of a string + /// Match part of the string, starting at the beginning. /// /// Returns `nil` if no match and throws on abort - public func matchFront(_: String) throws -> Match? + public func matchPrefix(_ s: String) throws -> Regex.Match? + + /// Find the first match in a string + /// + /// Returns `nil` if no match is found and throws on abort + public func firstMatch(in s: String) throws -> Regex.Match? + + /// Match a substring in its entirety. + /// + /// Returns `nil` if no match and throws on abort + public func matchWhole(_ s: Substring) throws -> Regex.Match? + + /// Match part of the string, starting at the beginning. + /// + /// Returns `nil` if no match and throws on abort + public func matchPrefix(_ s: Substring) throws -> Regex.Match? + + /// Find the first match in a substring + /// + /// Returns `nil` if no match is found and throws on abort + public func firstMatch(_ s: Substring) throws -> Regex.Match? /// The result of matching a regex against a string. + /// + /// A `Match` forwards API to the `Output` generic parameter, + /// providing direct access to captures. 
@dynamicMemberLookup public struct Match { /// The range of the overall match @@ -320,7 +349,7 @@ public struct Regex { /// The produced output from the match operation public var output: Output - /// Lookup a capture by number + /// Lookup a capture by name or number public subscript(dynamicMember keyPath: KeyPath) -> T /// Lookup a capture by number @@ -342,11 +371,6 @@ public struct Regex { extension Regex: RegexComponent { public var regex: Regex { self } - /// Create a regex out of a single component - public init( - _ content: Content - ) where Content.Output == Output - /// Result builder interface public init( @RegexComponentBuilder _ content: () -> Content @@ -360,11 +384,11 @@ extension Regex.Match { // Run-time compilation interfaces extension Regex { - /// Parse and compile `pattern`. + /// Parse and compile `pattern`, resulting in a strongly-typed capture list. public init(compiling pattern: String, as: Output.Type = Output.self) throws } extension Regex where Output == AnyRegexOutput { - /// Parse and compile `pattern`. + /// Parse and compile `pattern`, resulting in an existentially-typed capture list. public init(compiling pattern: String) throws } ``` diff --git a/Sources/Exercises/Participants/RegexParticipant.swift b/Sources/Exercises/Participants/RegexParticipant.swift index 731b9b6f6..a40de3953 100644 --- a/Sources/Exercises/Participants/RegexParticipant.swift +++ b/Sources/Exercises/Participants/RegexParticipant.swift @@ -63,7 +63,7 @@ private func graphemeBreakPropertyData( forLine line: String, using regex: RP ) -> GraphemeBreakEntry? where RP.Output == (Substring, Substring, Substring?, Substring) { - line.match(regex).map(\.output).flatMap(extractFromCaptures) + line.matchWhole(regex).map(\.output).flatMap(extractFromCaptures) } private func graphemeBreakPropertyDataLiteral( @@ -80,7 +80,7 @@ private func graphemeBreakPropertyDataLiteral( private func graphemeBreakPropertyData( forLine line: String ) -> GraphemeBreakEntry? 
{ - line.match { + line.matchWhole { TryCapture(OneOrMore(.hexDigit)) { Unicode.Scalar(hex: $0) } Optionally { ".." diff --git a/Sources/RegexBuilder/Match.swift b/Sources/RegexBuilder/Match.swift index 3f86f9498..ac07ec0b8 100644 --- a/Sources/RegexBuilder/Match.swift +++ b/Sources/RegexBuilder/Match.swift @@ -12,17 +12,29 @@ import _StringProcessing extension String { - public func match( + public func matchWhole( @RegexComponentBuilder _ content: () -> R ) -> Regex.Match? { - match(content()) + matchWhole(content()) + } + + public func matchPrefix( + @RegexComponentBuilder _ content: () -> R + ) -> Regex.Match? { + matchPrefix(content()) } } extension Substring { - public func match( + public func matchWhole( + @RegexComponentBuilder _ content: () -> R + ) -> Regex.Match? { + matchWhole(content()) + } + + public func matchPrefix( @RegexComponentBuilder _ content: () -> R ) -> Regex.Match? { - match(content()) + matchPrefix(content()) } } diff --git a/Sources/_StringProcessing/Algorithms/Consumers/RegexConsumer.swift b/Sources/_StringProcessing/Algorithms/Consumers/RegexConsumer.swift index 4dac3cef5..3ab1e579d 100644 --- a/Sources/_StringProcessing/Algorithms/Consumers/RegexConsumer.swift +++ b/Sources/_StringProcessing/Algorithms/Consumers/RegexConsumer.swift @@ -24,7 +24,7 @@ extension RegexConsumer { func _matchingConsuming( _ consumed: Substring, in range: Range ) -> (upperBound: String.Index, match: Match)? { - guard let result = regex._match( + guard let result = try! 
regex._match( consumed.base, in: range, mode: .partialFromFront ) else { return nil } diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index 2f99470fc..cac0e46c3 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -12,7 +12,18 @@ import _RegexParser extension Regex where Output == AnyRegexOutput { - public init(_ pattern: String) throws { + /// Parse and compile `pattern`, resulting in an existentially-typed capture list. + public init(compiling pattern: String) throws { + self.init(ast: try parse(pattern, .traditional)) + } +} + +extension Regex { + /// Parse and compile `pattern`, resulting in a strongly-typed capture list. + public init( + compiling pattern: String, + as: Output.Type = Output.self + ) throws { self.init(ast: try parse(pattern, .traditional)) } } diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 29d18ef22..265a7868c 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -18,8 +18,48 @@ public protocol RegexComponent { var regex: Regex { get } } -/// A regular expression. +/// A regex represents a string processing algorithm. +/// +/// let regex = try Regex(compiling: "a(.*)b") +/// let match = "cbaxb".firstMatch(of: regex) +/// print(match.0) // "axb" +/// print(match.1) // "x" +/// public struct Regex: RegexComponent { + let program: Program + + var hasCapture: Bool { + program.tree.hasCapture + } + + init(ast: AST) { + self.program = Program(ast: ast) + } + init(ast: AST.Node) { + self.program = Program(ast: .init(ast, globalOptions: nil)) + } + + // Compiler interface. Do not change independently. + @usableFromInline + init(_regexString pattern: String) { + self.init(ast: try! parse(pattern, .traditional)) + } + + // Compiler interface. Do not change independently. 
+ @usableFromInline + init(_regexString pattern: String, version: Int) { + assert(version == currentRegexLiteralFormatVersion) + // The version argument is passed by the compiler using the value defined + // in libswiftParseRegexLiteral. + self.init(ast: try! parseWithDelimiters(pattern)) + } + + public var regex: Regex { + self + } +} + +extension Regex { /// A program representation that caches any lowered representation for /// execution. internal class Program { @@ -41,49 +81,19 @@ public struct Regex: RegexComponent { self.tree = tree } } +} - let program: Program -// var ast: AST { program.ast } - +extension Regex { @_spi(RegexBuilder) public var root: DSLTree.Node { program.tree.root } - var hasCapture: Bool { - program.tree.hasCapture - } - - init(ast: AST) { - self.program = Program(ast: ast) - } - init(ast: AST.Node) { - self.program = Program(ast: .init(ast, globalOptions: nil)) - } - @_spi(RegexBuilder) public init(node: DSLTree.Node) { self.program = Program(tree: .init(node, options: nil)) } - // Compiler interface. Do not change independently. - @usableFromInline - init(_regexString pattern: String) { - self.init(ast: try! parse(pattern, .traditional)) - } - - // Compiler interface. Do not change independently. - @usableFromInline - init(_regexString pattern: String, version: Int) { - assert(version == currentRegexLiteralFormatVersion) - // The version argument is passed by the compiler using the value defined - // in libswiftParseRegexLiteral. - self.init(ast: try! 
parseWithDelimiters(pattern)) - } - - public var regex: Regex { - self - } } // MARK: - Primitive regex components diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index f3776b761..45d33f03e 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -10,11 +10,19 @@ //===----------------------------------------------------------------------===// extension Regex { + /// The result of matching a regex against a string. + /// + /// A `Match` forwards API to the `Output` generic parameter, + /// providing direct access to captures. @dynamicMemberLookup public struct Match { let input: String + + /// The range of the overall match public let range: Range + let rawCaptures: [StructuredCapture] + let referencedCaptureOffsets: [ReferenceID: Int] let value: Any? @@ -22,6 +30,7 @@ extension Regex { } extension Regex.Match { + /// The produced output from the match operation public var output: Output { if Output.self == AnyRegexOutput.self { let wholeMatchAsCapture = StructuredCapture( @@ -48,6 +57,7 @@ extension Regex.Match { } } + /// Lookup a capture by name or number public subscript(dynamicMember keyPath: KeyPath) -> T { output[keyPath: keyPath] } @@ -72,36 +82,89 @@ extension Regex.Match { } extension RegexComponent { - public func match(in input: String) -> Regex.Match? { - _match( - input, in: input.startIndex.. Regex.Match? { + try _match(s, in: s.startIndex.. Regex.Match? { + try _match(s, in: s.startIndex.. Regex.Match? { - _match( - input.base, in: input.startIndex.. Regex.Match? { + try _firstMatch(s, in: s.startIndex.. Regex.Match? { + try _match(s.base, in: s.startIndex.. Regex.Match? { + try _match(s.base, in: s.startIndex.. Regex.Match? { + try _firstMatch(s.base, in: s.startIndex.., mode: MatchMode = .wholeString - ) -> Regex.Match? { + ) throws -> Regex.Match? 
{ let executor = Executor(program: regex.program.loweredProgram) - do { return try executor.match(input, in: inputRange, mode) - } catch { - fatalError(String(describing: error)) + } + + func _firstMatch( + _ input: String, + in inputRange: Range + ) throws -> Regex.Match? { + // FIXME: Something more efficient, likely an engine interface, and we + // should scrap the RegexConsumer crap and call this + + var low = inputRange.lowerBound + let high = inputRange.upperBound + while low < high { + if let m = try _match(input, in: low..(_ regex: R) -> Regex.Match? { - regex.match(in: self) + public func matchWhole(_ regex: R) -> Regex.Match? { + try? regex.matchWhole(self) + } + public func matchPrefix(_ regex: R) -> Regex.Match? { + try? regex.matchPrefix(self) } } extension Substring { - public func match(_ regex: R) -> Regex.Match? { - regex.match(in: self) + public func matchWhole(_ regex: R) -> Regex.Match? { + try? regex.matchWhole(self) + } + public func matchPrefix(_ regex: R) -> Regex.Match? { + try? regex.matchPrefix(self) } } diff --git a/Tests/RegexBuilderTests/CustomTests.swift b/Tests/RegexBuilderTests/CustomTests.swift index b405a5399..7be95c28c 100644 --- a/Tests/RegexBuilderTests/CustomTests.swift +++ b/Tests/RegexBuilderTests/CustomTests.swift @@ -62,7 +62,7 @@ func customTest( let result: Match? switch call { case .match: - result = input.match(regex)?.output + result = input.matchWhole(regex)?.output case .firstMatch: result = input.firstMatch(of: regex)?.result } diff --git a/Tests/RegexBuilderTests/MotivationTests.swift b/Tests/RegexBuilderTests/MotivationTests.swift new file mode 100644 index 000000000..882ba6448 --- /dev/null +++ b/Tests/RegexBuilderTests/MotivationTests.swift @@ -0,0 +1,267 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. 
and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +// FIXME: macOS CI seems to be busted and Linux doesn't have FormatStyle +// So, we disable this file for now + +#if false + +import _MatchingEngine + +import XCTest +import _StringProcessing + +import RegexBuilder + +private struct Transaction: Hashable { + enum Kind: Hashable { + case credit + case debit + + init?(_ s: Substring) { + switch s.lowercased() { + case "credit": self = .credit + case "debit": self = .debit + default: return nil + } + } + } + + var kind: Kind + var date: Date + var account: String + var amount: Decimal +} +extension Transaction: CustomStringConvertible { + var description: String { + """ + kind: \(kind) + date: \(date) + account: \(account) + amount: \(amount) + """ + } +} + +private struct Statement { + var entries: [Transaction] + init(_ entries: S) where S.Element == Transaction { + self.entries = Array(entries) + } +} + +// In contrast to unit tests, or small functional tests, these +// test full workloads or perform real(ish) tasks. +// +// TODO: Consider adapting into Exercises or benchmark target... + +private let statement = """ +CREDIT 03/02/2022 Payroll $200.23 +CREDIT 03/03/2022 Sanctioned Individual A $2,000,000.00 +DEBIT 03/03/2022 Totally Legit Shell Corp $2,000,000.00 +DEBIT 03/05/2022 Beanie Babies Are Back $57.33 +""" + +private func processEntry(_ s: String) -> Transaction? { + var slice = s[...] + guard let kindEndIdx = slice.firstIndex(of: " "), + let kind = Transaction.Kind(slice[.. Transaction? { + let nsRegEx = try! NSRegularExpression(pattern: pattern) + + let range = NSRange(line.startIndex.. Transaction? { + // FIXME: Shouldn't this init throw? + let regex = try! 
Regex(compiling: pattern) + +// guard let result = line.match(regex) else { return nil } +// +// // TODO: We should have Regex or somesuch and `.1` +// // should be the same as `\1`. +// let dynCaps = result.1 +// +// +// let kind = Transaction.Kind(result.1.first!.capture as Substring) + + return nil +} + +@available(macOS 12.0, *) +private func processWithRuntimeStaticRegex(_ line: String) -> Transaction? { + let regex: Regex<(Substring, Substring, Substring, Substring, Substring)> + = try! Regex(compiling: pattern) + + return process(line, using: regex) +} + +@available(macOS 12.0, *) +private func processWithDSL(_ line: String) -> Transaction? { + let fieldSeparator = Regex { + CharacterClass.whitespace + OneOrMore(.whitespace) + } + + let regex = Regex { + Capture(OneOrMore(.word)) + fieldSeparator + + Capture(OneOrMore(.whitespace.inverted)) + fieldSeparator + + Capture { + OneOrMore { + Lookahead( + // FIXME: `fieldSeparator` differs, why? + Regex { + CharacterClass.whitespace + CharacterClass.whitespace + }, negative: true) + CharacterClass.any + } + } + fieldSeparator + + Capture { OneOrMore(.any) } + } + + return process(line, using: regex) +} + +@available(macOS 12.0, *) +private func process( + _ line: String, + using regex: Regex<(Substring, Substring, Substring, Substring, Substring)> +) -> Transaction? { + guard let output = try? regex.matchWhole(line), + let kind = Transaction.Kind(output.1) + else { + return nil + } + + let dateStrat = Date.FormatStyle(date: .numeric).parseStrategy + guard let date = try? Date(String(output.2), strategy: dateStrat) else { + return nil + } + + let account = String(output.3) + + guard let amount = try? Decimal( + String(output.4), format: .currency(code: "USD") + ) else { + return nil + } + + return Transaction( + kind: kind, date: date, account: account, amount: amount) +} + +extension RegexDSLTests { + + // TODO: FormatStyle not available on Linux... 
+ @available(macOS 12.0, *) + func testBankStatement() { + // TODO: Stop printing and start testing... + + for line in statement.split(separator: "\n") { + let line = String(line) + _ = processEntry(line) + + // NSRegularExpression + let referenceOutput = processWithNSRegularExpression(line)! + + XCTAssertEqual( + referenceOutput, processWithNSRegularExpression(line)) + + _ = processWithRuntimeDynamicRegex(line) + + // Static run-time regex + XCTAssertEqual( + referenceOutput, processWithRuntimeStaticRegex(line)) + + // DSL + let dslOut = processWithDSL(line)! + guard referenceOutput == dslOut else { + if referenceOutput.account != dslOut.account { + // FIXME: Bug in lookahead here? + continue + } + + XCTFail() + continue + } + + } + + } + +} + +#endif + diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 93a1dda65..50358734d 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -24,7 +24,7 @@ class RegexDSLTests: XCTestCase { ) throws { let regex = content() for (input, maybeExpectedCaptures) in tests { - let maybeMatch = input.match(regex) + let maybeMatch = input.matchWhole(regex) if let expectedCaptures = maybeExpectedCaptures { let match = try XCTUnwrap(maybeMatch, file: file, line: line) XCTAssertTrue( @@ -52,12 +52,12 @@ class RegexDSLTests: XCTestCase { } // Assert the inferred capture type. let _: (Substring, Substring, Int).Type = type(of: regex).Output.self - let maybeMatch = "ab1".match(regex) + let maybeMatch = "ab1".matchWhole(regex) let match = try XCTUnwrap(maybeMatch) XCTAssertTrue(match.output == ("ab1", "b", 1)) let substring = "ab1"[...] 
- let substringMatch = try XCTUnwrap(substring.match(regex)) + let substringMatch = try XCTUnwrap(substring.matchWhole(regex)) XCTAssertTrue(match.output == substringMatch.output) } @@ -73,7 +73,7 @@ class RegexDSLTests: XCTestCase { } func testMatchResultDotZeroWithoutCapture() throws { - let match = try XCTUnwrap("aaa".match { OneOrMore { "a" } }) + let match = try XCTUnwrap("aaa".matchWhole { OneOrMore { "a" } }) XCTAssertEqual(match.0, "aaa") } @@ -82,8 +82,8 @@ class RegexDSLTests: XCTestCase { let regex = ChoiceOf { "aaa" } - XCTAssertTrue("aaa".match(regex)?.output == "aaa") - XCTAssertNil("aab".match(regex)?.output) + XCTAssertTrue("aaa".matchWhole(regex)?.output == "aaa") + XCTAssertNil("aab".matchWhole(regex)?.output) } do { let regex = ChoiceOf { @@ -91,10 +91,10 @@ class RegexDSLTests: XCTestCase { "bbb" "ccc" } - XCTAssertTrue("aaa".match(regex)?.output == "aaa") - XCTAssertNil("aab".match(regex)?.output) - XCTAssertTrue("bbb".match(regex)?.output == "bbb") - XCTAssertTrue("ccc".match(regex)?.output == "ccc") + XCTAssertTrue("aaa".matchWhole(regex)?.output == "aaa") + XCTAssertNil("aab".matchWhole(regex)?.output) + XCTAssertTrue("bbb".matchWhole(regex)?.output == "bbb") + XCTAssertTrue("ccc".matchWhole(regex)?.output == "ccc") } do { let regex = Regex { @@ -109,7 +109,7 @@ class RegexDSLTests: XCTestCase { } } XCTAssertTrue( - try XCTUnwrap("abc".match(regex)?.output) == ("abc", "c")) + try XCTUnwrap("abc".matchWhole(regex)?.output) == ("abc", "c")) } do { let regex = ChoiceOf { @@ -117,18 +117,18 @@ class RegexDSLTests: XCTestCase { "bbb" "ccc" } - XCTAssertTrue("aaa".match(regex)?.output == "aaa") - XCTAssertNil("aab".match(regex)?.output) - XCTAssertTrue("bbb".match(regex)?.output == "bbb") - XCTAssertTrue("ccc".match(regex)?.output == "ccc") + XCTAssertTrue("aaa".matchWhole(regex)?.output == "aaa") + XCTAssertNil("aab".matchWhole(regex)?.output) + XCTAssertTrue("bbb".matchWhole(regex)?.output == "bbb") + 
XCTAssertTrue("ccc".matchWhole(regex)?.output == "ccc") } do { let regex = ChoiceOf { Capture("aaa") } XCTAssertTrue( - try XCTUnwrap("aaa".match(regex)?.output) == ("aaa", "aaa")) - XCTAssertNil("aab".match(regex)?.output) + try XCTUnwrap("aaa".matchWhole(regex)?.output) == ("aaa", "aaa")) + XCTAssertNil("aab".matchWhole(regex)?.output) } do { let regex = ChoiceOf { @@ -137,12 +137,12 @@ class RegexDSLTests: XCTestCase { Capture("ccc") } XCTAssertTrue( - try XCTUnwrap("aaa".match(regex)?.output) == ("aaa", "aaa", nil, nil)) + try XCTUnwrap("aaa".matchWhole(regex)?.output) == ("aaa", "aaa", nil, nil)) XCTAssertTrue( - try XCTUnwrap("bbb".match(regex)?.output) == ("bbb", nil, "bbb", nil)) + try XCTUnwrap("bbb".matchWhole(regex)?.output) == ("bbb", nil, "bbb", nil)) XCTAssertTrue( - try XCTUnwrap("ccc".match(regex)?.output) == ("ccc", nil, nil, "ccc")) - XCTAssertNil("aab".match(regex)?.output) + try XCTUnwrap("ccc".matchWhole(regex)?.output) == ("ccc", nil, nil, "ccc")) + XCTAssertNil("aab".matchWhole(regex)?.output) } } @@ -342,7 +342,7 @@ class RegexDSLTests: XCTestCase { // Assert the inferred capture type. 
let _: Substring.Type = type(of: regex).Output.self let input = "123123" - let match = try XCTUnwrap(input.match(regex)?.output) + let match = try XCTUnwrap(input.matchWhole(regex)?.output) XCTAssertTrue(match == input) } @@ -469,7 +469,7 @@ class RegexDSLTests: XCTestCase { let unicodeLine = "1BCA0..1BCA3 ; Control # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP" - let match = try XCTUnwrap(unicodeLine.match(unicodeData)) + let match = try XCTUnwrap(unicodeLine.matchWhole(unicodeData)) XCTAssertEqual(match.0, Substring(unicodeLine)) XCTAssertEqual(match.1, "Control") } @@ -501,7 +501,7 @@ class RegexDSLTests: XCTestCase { Substring, Unicode.Scalar?, Unicode.Scalar??, Substring ) let _: ExpectedMatch.Type = type(of: regexWithCapture).Output.self - let maybeMatchResult = line.match(regexWithCapture) + let maybeMatchResult = line.matchWhole(regexWithCapture) let matchResult = try XCTUnwrap(maybeMatchResult) let (wholeMatch, lower, upper, propertyString) = matchResult.output XCTAssertEqual(wholeMatch, Substring(line)) @@ -536,7 +536,7 @@ class RegexDSLTests: XCTestCase { Substring, Unicode.Scalar, Unicode.Scalar?, Substring ) let _: ExpectedMatch.Type = type(of: regexWithTryCapture).Output.self - let maybeMatchResult = line.match(regexWithTryCapture) + let maybeMatchResult = line.matchWhole(regexWithTryCapture) let matchResult = try XCTUnwrap(maybeMatchResult) let (wholeMatch, lower, upper, propertyString) = matchResult.output XCTAssertEqual(wholeMatch, Substring(line)) @@ -549,7 +549,7 @@ class RegexDSLTests: XCTestCase { let regexLiteral = try MockRegexLiteral( #"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#, matching: (Substring, Substring, Substring?, Substring).self) - let maybeMatchResult = line.match(regexLiteral) + let maybeMatchResult = line.matchWhole(regexLiteral) let matchResult = try XCTUnwrap(maybeMatchResult) let (wholeMatch, lower, upper, propertyString) = matchResult.output XCTAssertEqual(wholeMatch, Substring(line)) @@ -561,21 
+561,21 @@ class RegexDSLTests: XCTestCase { func testDynamicCaptures() throws { do { - let regex = try Regex("aabcc.") + let regex = try Regex(compiling: "aabcc.") let line = "aabccd" - let match = try XCTUnwrap(line.match(regex)) + let match = try XCTUnwrap(line.matchWhole(regex)) XCTAssertEqual(match.0, line[...]) let output = match.output XCTAssertEqual(output[0].substring, line[...]) } do { let regex = try Regex( - #"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#) + compiling: #"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#) let line = """ A6F0..A6F1 ; Extend # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM \ COMBINING MARK TUKWENTIS """ - let match = try XCTUnwrap(line.match(regex)) + let match = try XCTUnwrap(line.matchWhole(regex)) XCTAssertEqual(match.0, line[...]) let output = match.output XCTAssertEqual(output[0].substring, line[...]) @@ -640,7 +640,7 @@ class RegexDSLTests: XCTestCase { } } let input = "abc#41#42abc#42#42" - let result = try XCTUnwrap(input.match(regex)) + let result = try XCTUnwrap(input.matchWhole(regex)) XCTAssertEqual(result[a], "abc") XCTAssertEqual(result[b], 42) } @@ -720,7 +720,7 @@ class RegexDSLTests: XCTestCase { let parser = SemanticVersionParser() for (str, version) in versions { - XCTAssertEqual(str.match(parser)?.output, version) + XCTAssertEqual(str.matchWhole(parser)?.output, version) } } } diff --git a/Tests/RegexTests/AlgorithmsTests.swift b/Tests/RegexTests/AlgorithmsTests.swift index b51f12100..bc80746c6 100644 --- a/Tests/RegexTests/AlgorithmsTests.swift +++ b/Tests/RegexTests/AlgorithmsTests.swift @@ -32,7 +32,7 @@ class RegexConsumerTests: XCTestCase { _ expected: [Range], file: StaticString = #file, line: UInt = #line ) { - let regex = try! Regex(regex) + let regex = try! 
Regex(compiling: regex) let actualSeq: [Range] = string[...].ranges(of: regex).map { let start = string.offset(ofIndex: $0.lowerBound) @@ -77,7 +77,7 @@ class RegexConsumerTests: XCTestCase { _ expected: [Substring], file: StaticString = #file, line: UInt = #line ) { - let regex = try! Regex(regex) + let regex = try! Regex(compiling: regex) let actual = Array(string.split(by: regex)) XCTAssertEqual(actual, expected, file: file, line: line) } @@ -97,7 +97,7 @@ class RegexConsumerTests: XCTestCase { _ expected: String, file: StaticString = #file, line: UInt = #line ) { - let regex = try! Regex(regex) + let regex = try! Regex(compiling: regex) let actual = string.replacing(regex, with: replacement) XCTAssertEqual(actual, expected, file: file, line: line) } @@ -116,7 +116,7 @@ class RegexConsumerTests: XCTestCase { } func testAdHoc() { - let r = try! Regex("a|b+") + let r = try! Regex(compiling: "a|b+") XCTAssert("palindrome".contains(r)) XCTAssert("botany".contains(r)) From 43a78e81762c74a10da091df56da7b7d4179c5f5 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 23:41:31 +0100 Subject: [PATCH 16/17] Update escaping rules in RegexSyntax.md Tweak the text to say that e.g `\I` is invalid. --- Documentation/Evolution/RegexSyntax.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 958d52f7d..2b86f99f7 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -160,7 +160,7 @@ Atom -> Anchor | '\'? ``` -Atoms are the smallest units of regex syntax. They include escape sequences, metacharacters, backreferences, etc. The most basic form of atom is a literal character. A metacharacter may be treated as literal by preceding it with a backslash. Other literal characters may also be preceded with a backslash, but it has no effect if they are unknown escape sequences, e.g `\I` is literal `I`. 
+Atoms are the smallest units of regex syntax. They include escape sequences, metacharacters, backreferences, etc. The most basic form of atom is a literal character. A metacharacter may be treated as literal by preceding it with a backslash. Other literal characters may also be preceded by a backslash, in which case it has no effect, e.g `\%` is literal `%`. However this does not apply to either non-whitespace Unicode characters, or to unknown ASCII letter character escapes, e.g `\I` is invalid. #### Anchors From 2aa67f8f813e835d47f8ef614add8689247a8ec0 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 4 Apr 2022 10:49:51 +0100 Subject: [PATCH 17/17] Update Documentation/Evolution/RegexSyntax.md Co-authored-by: Michael Ilseman --- Documentation/Evolution/RegexSyntax.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 2b86f99f7..faa327176 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -160,7 +160,7 @@ Atom -> Anchor | '\'? ``` -Atoms are the smallest units of regex syntax. They include escape sequences, metacharacters, backreferences, etc. The most basic form of atom is a literal character. A metacharacter may be treated as literal by preceding it with a backslash. Other literal characters may also be preceded by a backslash, in which case it has no effect, e.g `\%` is literal `%`. However this does not apply to either non-whitespace Unicode characters, or to unknown ASCII letter character escapes, e.g `\I` is invalid. +Atoms are the smallest units of regex syntax. They include escape sequences, metacharacters, backreferences, etc. The most basic form of atom is a literal character. A metacharacter may be treated as literal by preceding it with a backslash. Other literal characters may also be preceded by a backslash, in which case it has no effect, e.g `\%` is literal `%`. 
However this does not apply to either non-ASCII non-whitespace characters, or to unknown ASCII letter character escapes, e.g `\I` is invalid and would produce an error.

#### Anchors