From 8e3d9ac6b64dd78697eb6cb0fd8a2bf0acef26ce Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Tue, 12 Apr 2022 07:53:27 -0500
Subject: [PATCH 01/13] Add tests for UTS18 level support

---
 Package.swift                     |   4 +-
 Tests/RegexTests/UTS18Tests.swift | 404 ++++++++++++++++++++++++++++++
 2 files changed, 407 insertions(+), 1 deletion(-)
 create mode 100644 Tests/RegexTests/UTS18Tests.swift
diff --git a/Package.swift b/Package.swift
index 47a73ca72..2728b0090 100644
--- a/Package.swift
+++ b/Package.swift
@@ -55,7 +55,9 @@ let package = Package(
             ]),
         .testTarget(
             name: "RegexTests",
-            dependencies: ["_StringProcessing"]),
+            dependencies: ["_StringProcessing"],
+            swiftSettings: [.unsafeFlags(["-Xfrontend", "-enable-experimental-string-processing"])]
+        ),
         .testTarget(
             name: "RegexBuilderTests",
             dependencies: ["_StringProcessing", "RegexBuilder"],
diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift
new file mode 100644
index 000000000..97ba156c4
--- /dev/null
+++ b/Tests/RegexTests/UTS18Tests.swift
@@ -0,0 +1,404 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+import XCTest
+@testable import _StringProcessing
+
+class UTS18Tests: XCTestCase {
+  var input: String {
+    "ABCdefghîøü\u{FFF0} -–—[]123"
+  // 012345678901       234567890
+  }
+
+}
+
+fileprivate extension String {
+  subscript<R: RangeExpression>(pos bounds: R) -> Substring
+    where R.Bound == Int
+  {
+    let bounds = bounds.relative(to: 0..<count)
+    return dropFirst(bounds.lowerBound).prefix(bounds.count)
+  }
+}
+
+fileprivate func expectFirstMatch<Output: Equatable>(
+  _ input: String,
+  _ r: Regex<Output>,
+  _ output: Output,
+  file: StaticString = #file,
+  line: UInt = #line)
+{
+  XCTAssertEqual(input.firstMatch(of: r)?.output, output, file: file, line: line)
+}
+
+#if os(Linux)
+func XCTExpectFailure(_ message: String? = nil, body: () -> Void) {}
+#endif
+
+// MARK: - Basic Unicode Support: Level 1
+
+// C1. An implementation claiming conformance to Level 1 of this specification
+// shall meet the requirements described in the following sections:
+extension UTS18Tests {
+  // RL1.1 Hex Notation
+  //
+  // To meet this requirement, an implementation shall supply a mechanism for
+  // specifying any Unicode code point (from U+0000 to U+10FFFF), using the
+  // hexadecimal code point representation.
+  func testHexNotation() throws {
+    expectFirstMatch("ab", #/\u{61}\u{62}/#, "ab")
+    expectFirstMatch("𝄞", #/\u{1D11E}/#, "𝄞")
+  }
+  
+  // 1.1.1 Hex Notation and Normalization
+  //
+  // TODO: Does this section make a recommendation?
+  
+  // RL1.2	Properties
+  // To meet this requirement, an implementation shall provide at least a
+  // minimal list of properties, consisting of the following:
+  // - General_Category
+  // - Script and Script_Extensions
+  // - Alphabetic
+  // - Uppercase
+  // - Lowercase
+  // - White_Space
+  // - Noncharacter_Code_Point
+  // - Default_Ignorable_Code_Point
+  // - ANY, ASCII, ASSIGNED
+  // The values for these properties must follow the Unicode definitions, and
+  // include the property and property value aliases from the UCD. Matching of
+  // Binary, Enumerated, Catalog, and Name values must follow the Matching
+  // Rules from [UAX44] with one exception: implementations are not required
+  // to ignore an initial prefix string of "is" in property values.
+  func testProperties() throws {
+    // General_Category
+    expectFirstMatch(input, #/\p{Lu}+/#, input[pos: ..<3])
+    expectFirstMatch(input, #/\p{lu}+/#, input[pos: ..<3])
+    expectFirstMatch(input, #/\p{uppercase letter}+/#, input[pos: ..<3])
+    expectFirstMatch(input, #/\p{Uppercase Letter}+/#, input[pos: ..<3])
+    expectFirstMatch(input, #/\p{Uppercase_Letter}+/#, input[pos: ..<3])
+    expectFirstMatch(input, #/\p{uppercaseletter}+/#, input[pos: ..<3])
+    
+    expectFirstMatch(input, #/\p{P}+/#, "-–—[]")
+    expectFirstMatch(input, #/\p{Pd}+/#, "-–—")
+    
+    expectFirstMatch(input, #/\p{Any}+/#, input[...])
+    expectFirstMatch(input, #/\p{Assigned}+/#, input[pos: ..<11])
+    expectFirstMatch(input, #/\p{ASCII}+/#, input[pos: ..<8])
+    
+    // Script and Script_Extensions
+    //    U+3042  あ  HIRAGANA LETTER A  Hira  {Hira}
+    XCTAssertTrue("\u{3042}".contains(#/\p{Hira}/#))
+    XCTAssertTrue("\u{3042}".contains(#/\p{sc=Hira}/#))
+    XCTAssertTrue("\u{3042}".contains(#/\p{scx=Hira}/#))
+    //    U+30FC  ー  KATAKANA-HIRAGANA PROLONGED SOUND MARK  Zyyy = Common  {Hira, Kana}
+    XCTAssertTrue("\u{30FC}".contains(#/\p{Hira}/#))      // Implicit = Script_Extensions
+    XCTAssertTrue("\u{30FC}".contains(#/\p{Kana}/#))
+    XCTAssertTrue("\u{30FC}".contains(#/\p{sc=Zyyy}/#))   // Explicit = Script
+    XCTAssertTrue("\u{30FC}".contains(#/\p{scx=Hira}/#))
+    XCTAssertTrue("\u{30FC}".contains(#/\p{scx=Kana}/#))
+    XCTAssertFalse("\u{30FC}".contains(#/\p{sc=Hira}/#))
+    XCTAssertFalse("\u{30FC}".contains(#/\p{sc=Kana}/#))
+    
+    // Uppercase, etc
+    expectFirstMatch(input, #/\p{Uppercase}+/#, input[pos: ..<3])
+    expectFirstMatch(input, #/\p{isUppercase}+/#, input[pos: ..<3])
+    expectFirstMatch(input, #/\p{Uppercase=true}+/#, input[pos: ..<3])
+    expectFirstMatch(input, #/\p{is Uppercase}+/#, input[pos: ..<3])
+    expectFirstMatch(input, #/\p{is uppercase = true}+/#, input[pos: ..<3])
+    expectFirstMatch(input, #/\p{lowercase}+/#, input[pos: 3..<11])
+    expectFirstMatch(input, #/\p{whitespace}+/#, input[pos: 12..<13])
+
+    // Block vs Writing System
+    let greekScalar = "Θ" // U+0398
+    let greekExtendedScalar = "ἀ" // U+1F00
+    XCTAssertTrue(greekScalar.contains(#/\p{Greek}/#))
+    XCTAssertTrue(greekExtendedScalar.contains(#/\p{Greek}/#))
+  }
+  
+  func testProperties_XFail() {
+    XCTExpectFailure("Need to support 'age' and 'block' properties") {
+      // XCTAssertFalse("z".contains(#/\p{age=3.1}/#))
+      XCTFail("\(#/\p{age=3.1}/#)")
+      // XCTAssertTrue("\u{1F00}".contains(#/\p{Block=Greek}/#))
+      XCTFail("\(#/\p{Block=Greek}/#)")
+    }
+  }
+  
+  // RL1.2a	Compatibility Properties
+  // To meet this requirement, an implementation shall provide the properties
+  // listed in Annex C: Compatibility Properties, with the property values as
+  // listed there. Such an implementation shall document whether it is using
+  // the Standard Recommendation or POSIX-compatible properties.
+  func testCompatibilityProperties() throws {
+    // FIXME: These tests seem insufficient
+    expectFirstMatch(input, #/[[:alpha:]]+/#, input[pos: ..<11])
+    expectFirstMatch(input, #/[[:upper:]]+/#, input[pos: ..<3])
+    expectFirstMatch(input, #/[[:lower:]]+/#, input[pos: 3..<11])
+    expectFirstMatch(input, #/[[:punct:]]+/#, input[pos: 13..<18])
+    expectFirstMatch(input, #/[[:digit:]]+/#, input[pos: 18..<21])
+    expectFirstMatch(input, #/[[:xdigit:]]+/#, input[pos: ..<6])
+    expectFirstMatch(input, #/[[:alnum:]]+/#, input[pos: ..<11])
+    expectFirstMatch(input, #/[[:space:]]+/#, input[pos: 12..<13])
+    // TODO: blank
+    // TODO: cntrl
+    expectFirstMatch(input, #/[[:graph:]]+/#, input[pos: ..<11])
+    expectFirstMatch(input, #/[[:print:]]+/#, input[...])
+    expectFirstMatch(input, #/[[:word:]]+/#, input[pos: ..<11])
+  }
+  
+  //RL1.3 Subtraction and Intersection
+  //
+  // To meet this requirement, an implementation shall supply mechanisms for
+  // union, intersection and set-difference of sets of characters within
+  // regular expression character class expressions.
+  func testSubtractionAndIntersection() {
+    // Non-ASCII letters
+    expectFirstMatch(input, #/[\p{Letter}--\p{ASCII}]+/#, input[pos: 8..<11])
+    // Digits that aren't 1 or 2
+    expectFirstMatch(input, #/[\p{digit}--[12]]+/#, input[pos: 20..<21])
+    
+    // ASCII-only letters
+    expectFirstMatch(input, #/[\p{Letter}&&\p{ASCII}]+/#, input[pos: ..<8])
+    // Digits that are 2 or 3
+    expectFirstMatch(input, #/[\p{digit}&&[23]]+/#, input[pos: 19..<21])
+    
+    // Non-ASCII lowercase + non-lowercase ASCII
+    expectFirstMatch(input, #/[\p{lowercase}~~\p{ascii}]+/#, input[pos: ..<3])
+    XCTAssertTrue("123%&^ABC".contains(#/^[\p{lowercase}~~\p{ascii}]+$/#))
+  }
+  
+  func testSubtractionAndIntersectionPrecedence() {
+    expectFirstMatch("ABC123-", #/[[:alnum:]]*-/#, "ABC123-")
+    expectFirstMatch("ABC123-", #/[[:alnum:]--\p{Uppercase}]*-/#, "123-")
+    // Union binds more closely than difference
+    expectFirstMatch("ABC123-", #/[[:alnum:]--\p{Uppercase}[:digit:]]*-/#, "-")
+    // TODO: Test for intersection precedence
+  }
+  
+  // RL1.4 Simple Word Boundaries
+  // To meet this requirement, an implementation shall extend the word boundary
+  // mechanism so that:
+  // - The class of <word_character> includes all the Alphabetic values from the
+  //   Unicode character database, from UnicodeData.txt, plus the decimals
+  //   (General_Category=Decimal_Number, or equivalently Numeric_Type=Decimal),
+  //   and the U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER
+  //   (Join_Control=True). See also Annex C: Compatibility Properties.
+  // - Nonspacing marks are never divided from their base characters, and
+  //   otherwise ignored in locating boundaries.
+  func testSimpleWordBoundaries() {
+    let simpleWordRegex = #/.+?\b/#.usingUnicodeWordBoundaries(false)
+    expectFirstMatch(input, simpleWordRegex, input[pos: ..<11])
+    expectFirstMatch("don't", simpleWordRegex, "don")
+    expectFirstMatch("Cafe\u{301}", simpleWordRegex, "Café")
+  }
+  
+  // RL1.5 Simple Loose Matches
+  //
+  // To meet this requirement, if an implementation provides for case-
+  // insensitive matching, then it shall provide at least the simple, default
+  // Unicode case-insensitive matching, and specify which properties are closed
+  // and which are not.
+  //
+  // To meet this requirement, if an implementation provides for case
+  // conversions, then it shall provide at least the simple, default Unicode
+  // case folding.
+  func testSimpleLooseMatches() {
+    expectFirstMatch("Dåb", #/Dåb/#.ignoringCase(), "Dåb")
+    expectFirstMatch("dÅB", #/Dåb/#.ignoringCase(), "dÅB")
+    expectFirstMatch("D\u{212B}B", #/Dåb/#.ignoringCase(), "D\u{212B}B")
+  }
+
+  func testSimpleLooseMatches_XFail() {
+    XCTExpectFailure("Need case folding support") {
+      let sigmas = "σΣς"
+      expectFirstMatch(sigmas, #/σ+/#.ignoringCase(), sigmas[...])
+      expectFirstMatch(sigmas, #/Σ+/#.ignoringCase(), sigmas[...])
+      expectFirstMatch(sigmas, #/ς+/#.ignoringCase(), sigmas[...])
+      
+      // TODO: Test German sharp S
+      // TODO: Test char classes, e.g. [\p{Block=Phonetic_Extensions} [A-E]]
+    }
+  }
+  
+  // RL1.6 Line Boundaries
+  //
+  // To meet this requirement, if an implementation provides for line-boundary
+  // testing, it shall recognize not only CRLF, LF, CR, but also NEL (U+0085),
+  // PARAGRAPH SEPARATOR (U+2029) and LINE SEPARATOR (U+2028).
+  func testLineBoundaries() {
+    let lineInput = """
+      01
+      02\r\
+      03\n\
+      04\u{a}\
+      05\u{b}\
+      06\u{c}\
+      07\u{d}\
+      08\u{d}\u{a}\
+      09\u{85}\
+      10\u{2028}\
+      11\u{2029}\
+      
+      """
+    // Check the input counts
+    var lines = lineInput.matches(of: #/\d{2}/#)
+    XCTAssertEqual(lines.count, 11)
+    // Test \R - newline sequence
+    lines = lineInput.matches(of: #/\d{2}\R/#)
+    XCTAssertEqual(lines.count, 11)
+    // Test anchors as line boundaries
+    lines = lineInput.matches(of: #/^\d{2}$/#.anchorsMatchLineEndings())
+    XCTAssertEqual(lines.count, 11)
+    // Test that dot does not match line endings
+    lines = lineInput.matches(of: #/.+/#)
+    XCTAssertEqual(lines.count, 11)
+    
+    // Does not contain an empty line
+    XCTAssertFalse(lineInput.contains(#/^$/#))
+    // Does contain an empty line (between \n and \r, which are reversed here)
+    let empty = "\n\r"
+    XCTAssertTrue(empty.contains(#/^$/#.anchorsMatchLineEndings()))
+  }
+  
+  // RL1.7 Supplementary Code Points
+  //
+  // To meet this requirement, an implementation shall handle the full range of
+  // Unicode code points, including values from U+FFFF to U+10FFFF. In
+  // particular, where UTF-16 is used, a sequence consisting of a leading
+  // surrogate followed by a trailing surrogate shall be handled as a single
+  // code point in matching.
+  func testSupplementaryCodePoints() {
+    XCTAssertTrue("👍".contains(#/\u{1F44D}/#))
+    XCTAssertTrue("👍".contains(#/[\u{1F440}-\u{1F44F}]/#))
+    XCTAssertTrue("👍👎".contains(#/^[\u{1F440}-\u{1F44F}]+$/#))
+  }
+}
+
+// MARK: - Extended Unicode Support: Level 2
+
+// C2.  An implementation claiming conformance to Level 2 of this specification
+// shall satisfy C1, and meet the requirements described in the following
+// sections:
+extension UTS18Tests {
+  // RL2.1 Canonical Equivalents
+  //
+  // Specific recommendation?
+  func testCanonicalEquivalents() {
+    XCTExpectFailure { XCTFail("Implement tests") }
+  }
+  
+  // RL2.2 Extended Grapheme Clusters and Character Classes with Strings
+  //
+  // To meet this requirement, an implementation shall provide a mechanism for
+  // matching against an arbitrary extended grapheme cluster, Character Classes
+  // with Strings, and extended grapheme cluster boundaries.
+  func testExtendedGraphemeClusters() {
+    XCTExpectFailure { XCTFail("Implement tests") }
+  }
+  
+  func testCharacterClassesWithStrings() {
+    XCTExpectFailure { XCTFail("Implement tests") }
+  }
+  
+  // RL2.3 Default Word Boundaries
+  //
+  // To meet this requirement, an implementation shall provide a mechanism for
+  // matching Unicode default word boundaries.
+  func testDefaultWordBoundaries() {
+    XCTExpectFailure { XCTFail("Implement tests") }
+  }
+
+  // RL2.4 Default Case Conversion
+  //
+  // To meet this requirement, if an implementation provides for case
+  // conversions, then it shall provide at least the full, default Unicode case
+  // folding.
+  func testDefaultCaseConversion() {
+    XCTExpectFailure { XCTFail("Implement tests") }
+  }
+  
+  // RL2.5 Name Properties
+  //
+  // To meet this requirement, an implementation shall support individually
+  // named characters.
+  func testNameProperty_XFail() {
+    XCTExpectFailure("Need \\p{name=...} support") {
+      XCTFail("\(#/\p{name=BOM}/#)")
+      // Name property
+      // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=ZERO WIDTH NO-BREAK SPACE}/#))
+      // Name property and Matching Rules
+      // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=zerowidthno breakspace}/#))
+      // Name_Alias property
+      // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=BYTE ORDER MARK}/#))
+      // Name_Alias property (again)
+      // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=BOM}/#))
+
+      // Computed name
+      // XCTAssertTrue("강".contains(#/\p{name=HANGUL SYLLABLE GANG}/#))
+
+      // Control character
+      // XCTAssertTrue("\u{7}".contains(#/\p{name=BEL}/#))
+      // Graphic symbol
+      // XCTAssertTrue("\u{1F514}".contains(#/\p{name=BELL}/#))
+    }
+  }
+  
+  func testIndividuallyNamedCharacters() {
+    XCTAssertTrue("\u{263A}".contains(#/\N{WHITE SMILING FACE}/#))
+    XCTAssertTrue("\u{3B1}".contains(#/\N{GREEK SMALL LETTER ALPHA}/#))
+    XCTAssertTrue("\u{10450}".contains(#/\N{SHAVIAN LETTER PEEP}/#))
+    
+    XCTAssertTrue("\u{FEFF}".contains(#/\N{ZERO WIDTH NO-BREAK SPACE}/#))
+    XCTAssertTrue("강".contains(#/\N{HANGUL SYLLABLE GANG}/#))
+    XCTAssertTrue("\u{1F514}".contains(#/\N{BELL}/#))
+  }
+
+  func testIndividuallyNamedCharacters_XFail() {
+    XCTExpectFailure("Need to support named chars in custom character classes") {
+      XCTFail("\(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#)")
+      // XCTAssertTrue("^\u{3B1}\u{3B2}$".contains(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#))
+    }
+    
+    XCTExpectFailure("Other named char failures -- investigate") {
+      XCTAssertTrue("\u{263A}".contains(#/\N{whitesmilingface}/#))
+      XCTAssertTrue("\u{C}".contains(#/\N{FORM FEED}/#))
+      XCTAssertTrue("\u{FEFF}".contains(#/\N{zerowidthno breakspace}/#))
+      XCTAssertTrue("\u{FEFF}".contains(#/\N{BYTE ORDER MARK}/#))
+      XCTAssertTrue("\u{FEFF}".contains(#/\N{BOM}/#))
+      XCTAssertTrue("\u{7}".contains(#/\N{BEL}/#))
+    }
+    
+    XCTExpectFailure("Need to recognize invalid names at compile time") {
+      XCTFail("This should be a compilation error, not a match failure:")
+      XCTAssertFalse("abc".contains(#/\N{NOT AN ACTUAL CHARACTER NAME}/#))
+    }
+  }
+
+  // RL2.6 Wildcards in Property Values
+  //
+  // To meet this requirement, an implementation shall support wildcards in
+  // Unicode property values.
+  func testWildcardsInPropertyValues() {
+    XCTExpectFailure { XCTFail("Implement tests") }
+  }
+  
+  // RL2.7 Full Properties
+  //
+  // To meet this requirement, an implementation shall support all of the
+  // properties listed below that are in the supported version of the Unicode
+  // Standard (or Unicode Technical Standard, respectively), with values that
+  // match the Unicode definitions for that version.
+  func testFullProperties() {
+    XCTExpectFailure { XCTFail("Implement tests") }
+  }
+}

From d4744adcf8df62320a70e27cb89a514bd21be942 Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Thu, 14 Apr 2022 06:29:32 -0500
Subject: [PATCH 02/13] Additional UTS18 tests

---
 Tests/RegexTests/UTS18Tests.swift | 124 +++++++++++++++++++++++++++++-
 1 file changed, 121 insertions(+), 3 deletions(-)

diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift
index 97ba156c4..4fc53d83f 100644
--- a/Tests/RegexTests/UTS18Tests.swift
+++ b/Tests/RegexTests/UTS18Tests.swift
@@ -10,14 +10,14 @@
 //===----------------------------------------------------------------------===//
 
 import XCTest
-@testable import _StringProcessing
+@testable // for internal `matches(of:)`
+import _StringProcessing
 
 class UTS18Tests: XCTestCase {
   var input: String {
     "ABCdefghîøü\u{FFF0} -–—[]123"
   // 012345678901       234567890
   }
-
 }
 
 fileprivate extension String {
@@ -361,6 +361,8 @@ extension UTS18Tests {
     XCTAssertTrue("\u{FEFF}".contains(#/\N{ZERO WIDTH NO-BREAK SPACE}/#))
     XCTAssertTrue("강".contains(#/\N{HANGUL SYLLABLE GANG}/#))
     XCTAssertTrue("\u{1F514}".contains(#/\N{BELL}/#))
+    XCTAssertTrue("🐯".contains(#/\N{TIGER FACE}/#))
+    XCTAssertFalse("🐯".contains(#/\N{TIEGR FACE}/#))
   }
 
   func testIndividuallyNamedCharacters_XFail() {
@@ -399,6 +401,122 @@ extension UTS18Tests {
   // Standard (or Unicode Technical Standard, respectively), with values that
   // match the Unicode definitions for that version.
   func testFullProperties() {
-    XCTExpectFailure { XCTFail("Implement tests") }
+    // MARK: General
+    // Name (Name_Alias)
+    // Block
+    // Age
+    // General_Category
+    // Script (Script_Extensions)
+    // White_Space
+    // Alphabetic
+    // Hangul_Syllable_Type
+    // Noncharacter_Code_Point
+    // Default_Ignorable_Code_Point
+    // Deprecated
+    // Logical_Order_Exception
+    // Variation_Selector
+
+    // MARK: Numeric
+    // Numeric_Value
+    // Numeric_Type
+    // Hex_Digit
+    // ASCII_Hex_Digit
+
+    // MARK: Identifiers
+    // ID_Continue
+    // ID_Start
+    // XID_Continue
+    // XID_Start
+    // Pattern_Syntax
+    // Pattern_White_Space
+    // Identifier_Status
+    // Identifier_Type
+
+    // MARK: CJK
+    // Ideographic
+    // Unified_Ideograph
+    // Radical
+    // IDS_Binary_Operator
+    // IDS_Trinary_Operator
+    // Equivalent_Unified_Ideograph
+    XCTExpectFailure()
+    XCTFail("Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)")
+    // XCTAssertTrue("⼚⺁厂".contains(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#))
+
+    // MARK: Case
+    // Uppercase
+    // Lowercase
+    // Simple_Lowercase_Mapping
+    // Simple_Titlecase_Mapping
+    // Simple_Uppercase_Mapping
+    // Simple_Case_Folding
+    // Soft_Dotted
+    // Cased
+    // Case_Ignorable
+    // Changes_When_Lowercased
+    // Changes_When_Uppercased
+    XCTAssertTrue("a".contains(#/\p{Changes_When_Uppercased}/#))
+    XCTAssertTrue("a".contains(#/\p{Changes_When_Uppercased=true}/#))
+    XCTAssertFalse("A".contains(#/\p{Changes_When_Uppercased}/#))
+    // Changes_When_Titlecased
+    // Changes_When_Casefolded
+    // Changes_When_Casemapped
+
+    // MARK: Normalization
+    // Canonical_Combining_Class
+    // Decomposition_Type
+    // NFC_Quick_Check
+    // NFKC_Quick_Check
+    // NFD_Quick_Check
+    // NFKD_Quick_Check
+    // NFKC_Casefold
+    // Changes_When_NFKC_Casefolded
+
+    // MARK: Emoji
+    // Emoji
+    // Emoji_Presentation
+    // Emoji_Modifier
+    // Emoji_Modifier_Base
+    // Emoji_Component
+    // Extended_Pictographic
+    // Basic_Emoji*
+    // Emoji_Keycap_Sequence*
+    // RGI_Emoji_Modifier_Sequence*
+    // RGI_Emoji_Flag_Sequence*
+    // RGI_Emoji_Tag_Sequence*
+    // RGI_Emoji_ZWJ_Sequence*
+    // RGI_Emoji*
+
+    // MARK: Shaping and Rendering
+    // Join_Control
+    // Joining_Group
+    // Joining_Type
+    // Vertical_Orientation
+    // Line_Break
+    // Grapheme_Cluster_Break
+    // Sentence_Break
+    // Word_Break
+    // East_Asian_Width
+    // Prepended_Concatenation_Mark
+
+    // MARK: Bidirectional
+    // Bidi_Class
+    // Bidi_Control
+    // Bidi_Mirrored
+    // Bidi_Mirroring_Glyph
+    // Bidi_Paired_Bracket
+    // Bidi_Paired_Bracket_Type
+
+    // MARK: Miscellaneous
+    // Math
+    // Quotation_Mark
+    // Dash
+    // Sentence_Terminal
+    // Terminal_Punctuation
+    // Diacritic
+    // Extender
+    // Grapheme_Base
+    // Grapheme_Extend
+    // Regional_Indicator
   }
 }

From 9a2d6237c3aac41aa84544c477b596620e95121e Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Tue, 19 Apr 2022 12:50:16 -0500
Subject: [PATCH 03/13] Implement canonical equivalence tests

---
 Tests/RegexTests/UTS18Tests.swift | 64 +++++++++++++++++++++++++------
 1 file changed, 52 insertions(+), 12 deletions(-)

diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift
index 4fc53d83f..de13579b2 100644
--- a/Tests/RegexTests/UTS18Tests.swift
+++ b/Tests/RegexTests/UTS18Tests.swift
@@ -17,6 +17,7 @@ class UTS18Tests: XCTestCase {
   var input: String {
     "ABCdefghîøü\u{FFF0} -–—[]123"
   // 012345678901       234567890
+  // 0         10               20
   }
 }
 
@@ -128,9 +129,9 @@ extension UTS18Tests {
   func testProperties_XFail() {
     XCTExpectFailure("Need to support 'age' and 'block' properties") {
       // XCTAssertFalse("z".contains(#/\p{age=3.1}/#))
-      XCTFail("\(#/\p{age=3.1}/#)")
+      XCTFail(#"\(#/\p{age=3.1}/#)"#)
       // XCTAssertTrue("\u{1F00}".contains(#/\p{Block=Greek}/#))
-      XCTFail("\(#/\p{Block=Greek}/#)")
+      XCTFail(#"\(#/\p{Block=Greek}/#)"#)
     }
   }
   
@@ -196,7 +197,7 @@ extension UTS18Tests {
   // - Nonspacing marks are never divided from their base characters, and
   //   otherwise ignored in locating boundaries.
   func testSimpleWordBoundaries() {
-    let simpleWordRegex = #/.+?\b/#.usingUnicodeWordBoundaries(false)
+    let simpleWordRegex = #/.+?\b/#.wordBoundaryKind(.unicodeLevel1)
     expectFirstMatch(input, simpleWordRegex, input[pos: ..<11])
     expectFirstMatch("don't", simpleWordRegex, "don")
     expectFirstMatch("Cafe\u{301}", simpleWordRegex, "Café")
@@ -213,17 +214,17 @@ extension UTS18Tests {
   // conversions, then it shall provide at least the simple, default Unicode
   // case folding.
   func testSimpleLooseMatches() {
-    expectFirstMatch("Dåb", #/Dåb/#.ignoringCase(), "Dåb")
-    expectFirstMatch("dÅB", #/Dåb/#.ignoringCase(), "dÅB")
-    expectFirstMatch("D\u{212B}B", #/Dåb/#.ignoringCase(), "D\u{212B}B")
+    expectFirstMatch("Dåb", #/Dåb/#.ignoresCase(), "Dåb")
+    expectFirstMatch("dÅB", #/Dåb/#.ignoresCase(), "dÅB")
+    expectFirstMatch("D\u{212B}B", #/Dåb/#.ignoresCase(), "D\u{212B}B")
   }
 
   func testSimpleLooseMatches_XFail() {
     XCTExpectFailure("Need case folding support") {
       let sigmas = "σΣς"
-      expectFirstMatch(sigmas, #/σ+/#.ignoringCase(), sigmas[...])
-      expectFirstMatch(sigmas, #/Σ+/#.ignoringCase(), sigmas[...])
-      expectFirstMatch(sigmas, #/ς+/#.ignoringCase(), sigmas[...])
+      expectFirstMatch(sigmas, #/σ+/#.ignoresCase(), sigmas[...])
+      expectFirstMatch(sigmas, #/Σ+/#.ignoresCase(), sigmas[...])
+      expectFirstMatch(sigmas, #/ς+/#.ignoresCase(), sigmas[...])
       
       // TODO: Test German sharp S
       // TODO: Test char classes, e.g. [\p{Block=Phonetic_Extensions} [A-E]]
@@ -294,7 +295,46 @@ extension UTS18Tests {
   //
   // Specific recommendation?
   func testCanonicalEquivalents() {
-    XCTExpectFailure { XCTFail("Implement tests") }
+    let equivalents = [
+      "\u{006f}\u{031b}\u{0323}",     // o + horn + dot_below
+      "\u{006f}\u{0323}\u{031b}",     // o + dot_below + horn
+      "\u{01a1}\u{0323}",             // o-horn + dot_below
+      "\u{1ecd}\u{031b}",             // o-dot_below + horn
+      "\u{1ee3}",                     // o-horn-dot_below
+    ]
+    
+    let regexes = [
+      #/\u{006f}\u{031b}\u{0323}/#,   // o + horn + dot_below
+      #/\u{006f}\u{0323}\u{031b}/#,   // o + dot_below + horn
+      #/\u{01a1}\u{0323}/#,           // o-horn + dot_below
+      #/\u{1ecd}\u{031b}/#,           // o-dot_below + horn
+      #/\u{1ee3}/#,                   // o-horn-dot_below
+    ]
+
+    // Default: Grapheme cluster semantics
+    for (regexNum, regex) in regexes.enumerated() {
+      for (equivNum, equiv) in equivalents.enumerated() {
+        XCTAssertTrue(
+          equiv.contains(regex),
+          "Grapheme cluster semantics: Regex \(regexNum) didn't match with string \(equivNum)")
+      }
+    }
+    
+    // Unicode scalar semantics
+    for (regexNum, regex) in regexes.enumerated() {
+      for (equivNum, equiv) in equivalents.enumerated() {
+        let regex = regex.matchingSemantics(.unicodeScalar)
+        if regexNum == equivNum {
+          XCTAssertTrue(
+            equiv.contains(regex),
+            "Unicode scalar semantics: Regex \(regexNum) didn't match with string \(equivNum)")
+        } else {
+          XCTAssertFalse(
+            equiv.contains(regex),
+            "Unicode scalar semantics: Regex \(regexNum) incorrectly matched with string \(equivNum)")
+        }
+      }
+    }
   }
   
   // RL2.2 Extended Grapheme Clusters and Character Classes with Strings
@@ -333,7 +373,7 @@ extension UTS18Tests {
   // named characters.
   func testNameProperty_XFail() {
     XCTExpectFailure("Need \\p{name=...} support") {
-      XCTFail("\(#/\p{name=BOM}/#)")
+      XCTFail(#"\(#/\p{name=BOM}/#)"#)
       // Name property
       // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=ZERO WIDTH NO-BREAK SPACE}/#))
       // Name property and Matching Rules
@@ -440,7 +480,7 @@ extension UTS18Tests {
     // IDS_Trinary_Operator
     // Equivalent_Unified_Ideograph
     XCTExpectFailure()
-    XCTFail("Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)")
+    XCTFail(#"Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)"#)
     // XCTAssertTrue("⼚⺁厂".contains(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#))
 
     // MARK: Case

From ebd1297cff9e0e9aa3e7eb6d593f53ac5203bb66 Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Tue, 19 Apr 2022 12:51:24 -0500
Subject: [PATCH 04/13] Fix canonical equivalence at different levels

This re-interprets runs of characters and scalars as a quoted literal,
and implements the correct semantic level matching for scalars,
characters, and quoted literals.
---
 Sources/_StringProcessing/ByteCodeGen.swift   | 56 ++++++++++++++-----
 .../_StringProcessing/ConsumerInterface.swift |  7 +++
 .../Regex/ASTConversion.swift                 | 12 ++--
 Tests/RegexTests/MatchTests.swift             | 22 ++++----
 4 files changed, 68 insertions(+), 29 deletions(-)

diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
index 86309bb8a..b5350d72a 100644
--- a/Sources/_StringProcessing/ByteCodeGen.swift
+++ b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -168,7 +168,15 @@ extension Compiler.ByteCodeGen {
   }
   
   mutating func emitCharacter(_ c: Character) throws {
-    // FIXME: Does semantic level matter?
+    // Under Unicode scalar semantics, emit each of the character's scalars
+    // individually instead of emitting a single whole-character match.
+    if options.semanticLevel == .unicodeScalar {
+      for scalar in c.unicodeScalars {
+        try emitScalar(scalar)
+      }
+      return
+    }
+    
     if options.isCaseInsensitive && c.isCased {
       // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true)
       builder.buildConsume { input, bounds in
@@ -627,22 +635,44 @@ extension Compiler.ByteCodeGen {
       try emitAtom(a)
 
     case let .quotedLiteral(s):
-      // TODO: Should this incorporate options?
-      if options.isCaseInsensitive {
-        // TODO: buildCaseInsensitiveMatchSequence(c) or alternative
-        builder.buildConsume { input, bounds in
-          var iterator = s.makeIterator()
+      if options.semanticLevel == .graphemeCluster {
+        if options.isCaseInsensitive {
+          // TODO: buildCaseInsensitiveMatchSequence(c) or alternative
+          builder.buildConsume { input, bounds in
+            var iterator = s.makeIterator()
+            var currentIndex = bounds.lowerBound
+            while let ch = iterator.next() {
+              guard currentIndex < bounds.upperBound,
+                    ch.lowercased() == input[currentIndex].lowercased()
+              else { return nil }
+              input.formIndex(after: &currentIndex)
+            }
+            return currentIndex
+          }
+        } else {
+          builder.buildMatchSequence(s)
+        }
+      } else {
+        builder.buildConsume {
+          [caseInsensitive = options.isCaseInsensitive] input, bounds in
+          // TODO: Case folding
+          var iterator = s.unicodeScalars.makeIterator()
           var currentIndex = bounds.lowerBound
-          while let ch = iterator.next() {
-            guard currentIndex < bounds.upperBound,
-                  ch.lowercased() == input[currentIndex].lowercased()
-            else { return nil }
-            input.formIndex(after: &currentIndex)
+          while let scalar = iterator.next() {
+            guard currentIndex < bounds.upperBound else { return nil }
+            if caseInsensitive {
+              if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping {
+                return nil
+              }
+            } else {
+              if scalar != input.unicodeScalars[currentIndex] {
+                return nil
+              }
+            }
+            input.unicodeScalars.formIndex(after: &currentIndex)
           }
           return currentIndex
         }
-      } else {
-        builder.buildMatchSequence(s)
       }
 
     case let .regexLiteral(l):
diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
index f77cd322f..8b54f0527 100644
--- a/Sources/_StringProcessing/ConsumerInterface.swift
+++ b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -131,6 +131,13 @@ extension AST.Atom {
     }
   }
 
+  /// The Unicode scalar this atom represents when its kind is `.scalar`,
+  /// or `nil` for every other kind of atom.
+  var singleScalar: UnicodeScalar? {
+    guard case .scalar(let s) = kind else { return nil }
+    return s
+  }
+
   func generateConsumer(
     _ opts: MatchingOptions
   ) throws -> MEProgram<String>.ConsumeFunction? {
diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift
index 8acbd3b1b..b423d62ae 100644
--- a/Sources/_StringProcessing/Regex/ASTConversion.swift
+++ b/Sources/_StringProcessing/Regex/ASTConversion.swift
@@ -65,13 +65,17 @@ extension AST.Node {
             // TODO: For printing, nice to coalesce
             // scalars literals too. We likely need a different
             // approach even before we have a better IR.
-            guard let char = atom?.singleCharacter else {
+            if let char = atom?.singleCharacter  {
+              result.append(char)
+            } else if let scalar = atom?.singleScalar {
+              result.append(Character(scalar))
+            } else {
               break
             }
-            result.append(char)
+            
             astChildren.formIndex(after: &idx)
           }
-          return result.count <= 1 ? nil : (idx, result)
+          return result.isEmpty ? nil : (idx, result)
         }
 
         // No need to nest single children concatenations
@@ -207,7 +211,7 @@ extension AST.Atom {
 
     switch self.kind {
     case let .char(c):                    return .char(c)
-    case let .scalar(s):                  return .scalar(s)
+    case let .scalar(s):                  return .char(Character(s))
     case .any:                            return .any
     case let .backreference(r):           return .backreference(r)
     case let .changeMatchingOptions(seq): return .changeMatchingOptions(seq)
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index 4d9ed4d01..c6fb18835 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -938,15 +938,15 @@ extension RegexTests {
 
     // TODO: Oniguruma \y and \Y
     firstMatchTests(
-      #"\u{65}"#,             // Scalar 'e' is present in both:
-      ("Cafe\u{301}", "e"),   // composed and
-      ("Sol Cafe", "e"))      // standalone
+      #"\u{65}"#,             // Scalar 'e' is present in both
+      ("Cafe\u{301}", nil),   // but scalar mode requires boundary at end of match
+      ("Sol Cafe", "e"))      // standalone is okay
     firstMatchTests(
       #"\u{65}\y"#,           // Grapheme boundary assertion
       ("Cafe\u{301}", nil),
       ("Sol Cafe", "e"))
     firstMatchTests(
-      #"\u{65}\Y"#,           // Grapheme non-boundary assertion
+      #"(?u)\u{65}\Y"#,       // Grapheme non-boundary assertion
       ("Cafe\u{301}", "e"),
       ("Sol Cafe", nil))
   }
@@ -1353,11 +1353,10 @@ extension RegexTests {
     // as a character.
 
     firstMatchTest(#"\u{65}\u{301}$"#, input: eDecomposed, match: eDecomposed)
-    // FIXME: Decomposed character in regex literal doesn't match an equivalent character
-    firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed,
-      xfail: true)
+    firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed)
 
-    firstMatchTest(#"\u{65}"#, input: eDecomposed, match: "e")
+    firstMatchTest(#"\u{65}"#, input: eDecomposed, match: "e",
+      xfail: true)
     firstMatchTest(#"\u{65}$"#, input: eDecomposed, match: nil)
     // FIXME: \y is unsupported
     firstMatchTest(#"\u{65}\y"#, input: eDecomposed, match: nil,
@@ -1381,12 +1380,10 @@ extension RegexTests {
       (eComposed, true),
       (eDecomposed, true))
 
-    // FIXME: Decomposed character in regex literal doesn't match an equivalent character
     matchTest(
       #"e\u{301}$"#,
       (eComposed, true),
-      (eDecomposed, true),
-      xfail: true)
+      (eDecomposed, true))
 
     matchTest(
       #"e$"#,
@@ -1472,7 +1469,8 @@ extension RegexTests {
     firstMatchTest(#"\u{1F1F0}\u{1F1F7}"#, input: flag, match: flag)
     
     // First Unicode scalar followed by CCC of regional indicators
-    firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag)
+    firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag,
+              xfail: true)
 
     // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character
     // A CCC of regional indicators x 2

From 439d20341a32a6b762a373298527dd392954b5da Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Wed, 20 Apr 2022 12:24:56 -0500
Subject: [PATCH 05/13] Match characters by scalars; allow \X in scalar mode

---
 Sources/_StringProcessing/ByteCodeGen.swift   |  9 ++++++
 .../_CharacterClassModel.swift                | 10 ++++--
 Tests/RegexTests/MatchTests.swift             | 32 ++++++++++++++-----
 Tests/RegexTests/UTS18Tests.swift             |  9 ++++--
 4 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
index b5350d72a..fbabcb5b9 100644
--- a/Sources/_StringProcessing/ByteCodeGen.swift
+++ b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -187,7 +187,16 @@ extension Compiler.ByteCodeGen {
           : nil
       }
     } else {
+      let done = builder.makeAddress()
+      let next = builder.makeAddress()
+      builder.buildSave(next)
+      for scalar in c.unicodeScalars {
+        try emitScalar(scalar)
+      }
+      builder.buildBranch(to: done)
+      builder.label(next)
       builder.buildMatch(c)
+      builder.label(done)      
     }
   }
 
diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift
index c9762f00e..bd36a6cde 100644
--- a/Sources/_StringProcessing/_CharacterClassModel.swift
+++ b/Sources/_StringProcessing/_CharacterClassModel.swift
@@ -177,10 +177,14 @@ public struct _CharacterClassModel: Hashable {
       return matched ? str.index(after: i) : nil
     case .unicodeScalar:
       let c = str.unicodeScalars[i]
+      var nextIndex = str.unicodeScalars.index(after: i)
       var matched: Bool
       switch cc {
-      case .any: matched = true
-      case .anyGrapheme: fatalError("Not matched in this mode")
+      case .any:
+        matched = true
+      case .anyGrapheme:
+        matched = true
+        nextIndex = str.index(after: i)
       case .digit:
         matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits)
       case .hexDigit:
@@ -197,7 +201,7 @@ public struct _CharacterClassModel: Hashable {
       if isInverted {
         matched.toggle()
       }
-      return matched ? str.unicodeScalars.index(after: i) : nil
+      return matched ? nextIndex : nil
     }
   }
 }
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index c6fb18835..d07646077 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -400,7 +400,8 @@ extension RegexTests {
       "a++a",
       ("babc", nil),
       ("baaabc", nil),
-      ("bb", nil))
+      ("bb", nil),
+      xfail: true)
     firstMatchTests(
       "a+?a",
       ("babc", nil),
@@ -462,15 +463,11 @@ extension RegexTests {
       "a{2,4}+a",
       ("babc", nil),
       ("baabc", nil),
-      ("baaabc", nil),
       ("baaaaabc", "aaaaa"),
       ("baaaaaaaabc", "aaaaa"),
       ("bb", nil))
     firstMatchTests(
       "a{,4}+a",
-      ("babc", nil),
-      ("baabc", nil),
-      ("baaabc", nil),
       ("baaaaabc", "aaaaa"),
       ("baaaaaaaabc", "aaaaa"),
       ("bb", nil))
@@ -478,11 +475,25 @@ extension RegexTests {
       "a{2,}+a",
       ("babc", nil),
       ("baabc", nil),
+      ("bb", nil))
+    
+    // XFAIL'd versions of the above
+    firstMatchTests(
+      "a{2,4}+a",
+      ("baaabc", nil),
+      xfail: true)
+    firstMatchTests(
+      "a{,4}+a",
+      ("babc", nil),
+      ("baabc", nil),
+      ("baaabc", nil),
+      xfail: true)
+    firstMatchTests(
+      "a{2,}+a",
       ("baaabc", nil),
       ("baaaaabc", nil),
       ("baaaaaaaabc", nil),
-      ("bb", nil))
-
+      xfail: true)
 
     firstMatchTests(
       "(?:a{2,4}?b)+",
@@ -940,7 +951,11 @@ extension RegexTests {
     firstMatchTests(
       #"\u{65}"#,             // Scalar 'e' is present in both
       ("Cafe\u{301}", nil),   // but scalar mode requires boundary at end of match
+      xfail: true)
+    firstMatchTests(
+      #"\u{65}"#,             // Scalar 'e' is present in both
       ("Sol Cafe", "e"))      // standalone is okay
+
     firstMatchTests(
       #"\u{65}\y"#,           // Grapheme boundary assertion
       ("Cafe\u{301}", nil),
@@ -1355,7 +1370,8 @@ extension RegexTests {
     firstMatchTest(#"\u{65}\u{301}$"#, input: eDecomposed, match: eDecomposed)
     firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed)
 
-    firstMatchTest(#"\u{65}"#, input: eDecomposed, match: "e",
+    // FIXME: Implicit \y at end of match
+    firstMatchTest(#"\u{65}"#, input: eDecomposed, match: nil,
       xfail: true)
     firstMatchTest(#"\u{65}$"#, input: eDecomposed, match: nil)
     // FIXME: \y is unsupported
diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift
index de13579b2..733126bcf 100644
--- a/Tests/RegexTests/UTS18Tests.swift
+++ b/Tests/RegexTests/UTS18Tests.swift
@@ -343,11 +343,16 @@ extension UTS18Tests {
   // matching against an arbitrary extended grapheme cluster, Character Classes
   // with Strings, and extended grapheme cluster boundaries.
   func testExtendedGraphemeClusters() {
-    XCTExpectFailure { XCTFail("Implement tests") }
+    XCTAssertTrue("abcdef🇬🇭".contains(#/abcdef.$/#))
+    XCTAssertTrue("abcdef🇬🇭".contains(#/abcdef\X$/#))
+    XCTAssertTrue("abcdef🇬🇭".contains(#/abcdef\X$/#.matchingSemantics(.unicodeScalar)))
+    XCTAssertTrue("abcdef🇬🇭".contains(#/abcdef.+\y/#.matchingSemantics(.unicodeScalar)))
   }
   
   func testCharacterClassesWithStrings() {
-    XCTExpectFailure { XCTFail("Implement tests") }
+    let regex = #/[a-z🧐🇧🇪🇧🇫🇧🇬]/#
+    XCTAssertTrue("🧐".contains(regex))
+    XCTAssertTrue("🇧🇫".contains(regex))
   }
   
   // RL2.3 Default Word Boundaries

From dfd917b20d7fcb6dd32d44b8ecaf537c5a7aaeae Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Wed, 20 Apr 2022 13:20:44 -0500
Subject: [PATCH 06/13] Document possessive quantification issues

---
 Tests/RegexTests/MatchTests.swift | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index c6fb18835..e50334c04 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -169,6 +169,8 @@ func firstMatchTest(
       XCTAssertEqual(found, match, file: file, line: line)
     }
   } catch {
+    // FIXME: This allows non-matches to succeed even when xfail'd
+    // When xfail == true, this should report failure for match == nil
     if !xfail && match != nil {
       XCTFail("\(error)", file: file, line: line)
     }
@@ -182,7 +184,9 @@ func firstMatchTests(
   syntax: SyntaxOptions = .traditional,
   enableTracing: Bool = false,
   dumpAST: Bool = false,
-  xfail: Bool = false
+  xfail: Bool = false,
+  file: StaticString = #filePath,
+  line: UInt = #line
 ) {
   for (input, match) in tests {
     firstMatchTest(
@@ -192,7 +196,9 @@ func firstMatchTests(
       syntax: syntax,
       enableTracing: enableTracing,
       dumpAST: dumpAST,
-      xfail: xfail)
+      xfail: xfail,
+      file: file,
+      line: line)
   }
 }
 
@@ -483,6 +489,24 @@ extension RegexTests {
       ("baaaaaaaabc", nil),
       ("bb", nil))
 
+    // XFAIL'd possessive tests
+    firstMatchTests(
+      "a?+a",
+      ("a", nil),
+      xfail: true)
+    firstMatchTests(
+      "(a|a)?+a",
+      ("a", nil),
+      xfail: true)
+    firstMatchTests(
+      "(a|a){2,4}+a",
+      ("a", nil),
+      ("aa", nil))
+    firstMatchTests(
+      "(a|a){2,4}+a",
+      ("aaa", nil),
+      ("aaaa", nil),
+      xfail: true)
 
     firstMatchTests(
       "(?:a{2,4}?b)+",

From 5adaf13dac6896fd1d525fcceebdf76a5fcaf274 Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Thu, 21 Apr 2022 10:07:30 -0500
Subject: [PATCH 07/13] Test named chars x semantic level

---
 Tests/RegexTests/UTS18Tests.swift | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift
index de13579b2..485eedb72 100644
--- a/Tests/RegexTests/UTS18Tests.swift
+++ b/Tests/RegexTests/UTS18Tests.swift
@@ -403,6 +403,10 @@ extension UTS18Tests {
     XCTAssertTrue("\u{1F514}".contains(#/\N{BELL}/#))
     XCTAssertTrue("🐯".contains(#/\N{TIGER FACE}/#))
     XCTAssertFalse("🐯".contains(#/\N{TIEGR FACE}/#))
+    
+    // Matching semantic level
+    XCTAssertFalse("👩‍👩‍👧‍👦".contains(#/.\N{ZERO WIDTH JOINER}/#))
+    XCTAssertTrue("👩‍👩‍👧‍👦".contains(#/(?u).\N{ZERO WIDTH JOINER}/#))
   }
 
   func testIndividuallyNamedCharacters_XFail() {

From f83d422d0b6ee243fefbfe9ca932641f6223b51c Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Thu, 5 May 2022 11:15:46 -0500
Subject: [PATCH 08/13] Enable loose matching on \N{...} scalar names

---
 .../_StringProcessing/ConsumerInterface.swift | 42 +++++++++++++++++--
 Tests/RegexTests/UTS18Tests.swift             | 10 +++--
 2 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
index 72e914b69..470050502 100644
--- a/Sources/_StringProcessing/ConsumerInterface.swift
+++ b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -111,6 +111,38 @@ extension DSLTree.Atom {
   }
 }
 
+extension String {
+  /// Compares this string to `other` using the loose matching rule UAX44-LM2,
+  /// which ignores case, whitespace, underscores, and nearly all medial
+  /// hyphens.
+  ///
+  /// Both strings are reduced to their significant characters before they are
+  /// compared, so a shorter `other`, or trailing ignorable characters in
+  /// either string, cannot cause an out-of-bounds access.
+  ///
+  /// FIXME: Only ignore medial hyphens
+  /// FIXME: Special case for U+1180 HANGUL JUNGSEONG O-E
+  /// See https://www.unicode.org/reports/tr44/#Matching_Rules
+  ///
+  /// - Parameter other: The name to compare against.
+  /// - Returns: `true` if the strings are equal under loose matching.
+  fileprivate func isEqualByUAX44LM2(to other: String) -> Bool {
+    // Characters that loose matching skips entirely.
+    func isIgnorable(_ c: Character) -> Bool {
+      c.isWhitespace || c == "-" || c == "_"
+    }
+
+    // Filter lazily so that no intermediate strings are allocated for the
+    // stripped names.
+    let lhs = self.lazy.filter { !isIgnorable($0) }
+    let rhs = other.lazy.filter { !isIgnorable($0) }
+
+    // Try exact equality first; fall back to comparing lowercased forms.
+    return lhs.elementsEqual(rhs) {
+      $0 == $1 || $0.lowercased() == $1.lowercased()
+    }
+  }
+}
 
 // TODO: This is basically an AST interpreter, which would
 // be good or interesting to build regardless, and serves
@@ -174,10 +206,12 @@ extension AST.Atom {
       return try p.generateConsumer(opts)
 
     case let .namedCharacter(name):
-      return consumeScalarProp {
-        // TODO: alias? casing?
-        $0.name == name || $0.nameAlias == name
-      }
+      return consumeScalar(propertyScalarPredicate {
+        // FIXME: name aliases not covered by $0.nameAlias are missed
+        // e.g. U+FEFF is also 'BYTE ORDER MARK' and 'BOM'
+        $0.name?.isEqualByUAX44LM2(to: name) == true
+          || $0.nameAlias?.isEqualByUAX44LM2(to: name) == true
+      })
       
     case .any:
       assertionFailure(
diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift
index 9e96138b1..d76329670 100644
--- a/Tests/RegexTests/UTS18Tests.swift
+++ b/Tests/RegexTests/UTS18Tests.swift
@@ -408,7 +408,13 @@ extension UTS18Tests {
     XCTAssertTrue("\u{1F514}".contains(#/\N{BELL}/#))
     XCTAssertTrue("🐯".contains(#/\N{TIGER FACE}/#))
     XCTAssertFalse("🐯".contains(#/\N{TIEGR FACE}/#))
-    
+
+    // Loose matching
+    XCTAssertTrue("\u{263A}".contains(#/\N{whitesmilingface}/#))
+    XCTAssertTrue("\u{263A}".contains(#/\N{wHiTe_sMiLiNg_fAcE}/#))
+    XCTAssertTrue("\u{263A}".contains(#/\N{White Smiling-Face}/#))
+    XCTAssertTrue("\u{FEFF}".contains(#/\N{zerowidthno breakspace}/#))
+
     // Matching semantic level
     XCTAssertFalse("👩‍👩‍👧‍👦".contains(#/.\N{ZERO WIDTH JOINER}/#))
     XCTAssertTrue("👩‍👩‍👧‍👦".contains(#/(?u).\N{ZERO WIDTH JOINER}/#))
@@ -421,9 +427,7 @@ extension UTS18Tests {
     }
     
     XCTExpectFailure("Other named char failures -- investigate") {
-      XCTAssertTrue("\u{263A}".contains(#/\N{whitesmilingface}/#))
       XCTAssertTrue("\u{C}".contains(#/\N{FORM FEED}/#))
-      XCTAssertTrue("\u{FEFF}".contains(#/\N{zerowidthno breakspace}/#))
       XCTAssertTrue("\u{FEFF}".contains(#/\N{BYTE ORDER MARK}/#))
       XCTAssertTrue("\u{FEFF}".contains(#/\N{BOM}/#))
       XCTAssertTrue("\u{7}".contains(#/\N{BEL}/#))

From 1282c706bb29e8affc7e45c60be7b07c22c964fb Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Thu, 5 May 2022 11:31:38 -0500
Subject: [PATCH 09/13] Make Unicode property classes work with semantics

---
 .../_StringProcessing/ConsumerInterface.swift | 323 ++++++++++--------
 .../Unicode/CharacterProps.swift              |   6 +
 Tests/RegexTests/MatchTests.swift             |   8 +-
 Tests/RegexTests/UTS18Tests.swift             |   6 +-
 4 files changed, 187 insertions(+), 156 deletions(-)

diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
index 470050502..7331897f1 100644
--- a/Sources/_StringProcessing/ConsumerInterface.swift
+++ b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -353,8 +353,9 @@ extension DSLTree.CustomCharacterClass {
         }
       }
       if isInverted {
-        // FIXME: semantic level
-        return input.index(after: bounds.lowerBound)
+        return opts.semanticLevel == .graphemeCluster
+          ? input.index(after: bounds.lowerBound)
+          : input.unicodeScalars.index(after: bounds.lowerBound)
       }
       return nil
     }
@@ -362,38 +363,26 @@ extension DSLTree.CustomCharacterClass {
 }
 
 // NOTE: Conveniences, though not most performant
-private func consumeScalarScript(
-  _ s: Unicode.Script
-) -> MEProgram<String>.ConsumeFunction {
-  consumeScalar {
-    Unicode.Script($0) == s
-  }
+typealias ScalarPredicate = (UnicodeScalar) -> Bool
+
+private func scriptScalarPredicate(_ s: Unicode.Script) -> ScalarPredicate {
+  { Unicode.Script($0) == s }
 }
-private func consumeScalarScriptExtension(
-  _ s: Unicode.Script
-) -> MEProgram<String>.ConsumeFunction {
-  consumeScalar {
-    let extensions = Unicode.Script.extensions(for: $0)
-    return extensions.contains(s)
-  }
+private func scriptExtensionScalarPredicate(_ s: Unicode.Script) -> ScalarPredicate {
+  { Unicode.Script.extensions(for: $0).contains(s) }
 }
-private func consumeScalarGC(
-  _ gc: Unicode.GeneralCategory
-) -> MEProgram<String>.ConsumeFunction {
-  consumeScalar { gc == $0.properties.generalCategory }
+private func categoryScalarPredicate(_ gc: Unicode.GeneralCategory) -> ScalarPredicate {
+  { gc == $0.properties.generalCategory }
 }
-private func consumeScalarGCs(
-  _ gcs: [Unicode.GeneralCategory]
-) -> MEProgram<String>.ConsumeFunction {
-  consumeScalar { gcs.contains($0.properties.generalCategory) }
+private func categoriesScalarPredicate(_ gcs: [Unicode.GeneralCategory]) -> ScalarPredicate {
+  { gcs.contains($0.properties.generalCategory) }
 }
-private func consumeScalarProp(
-  _ p: @escaping (Unicode.Scalar.Properties) -> Bool
-) -> MEProgram<String>.ConsumeFunction {
-  consumeScalar { p($0.properties) }
+private func propertyScalarPredicate(_ p: @escaping (Unicode.Scalar.Properties) -> Bool) -> ScalarPredicate {
+  { p($0.properties) }
 }
+
 func consumeScalar(
-  _ p: @escaping (Unicode.Scalar) -> Bool
+  _ p: @escaping ScalarPredicate
 ) -> MEProgram<String>.ConsumeFunction {
   { input, bounds in
     // TODO: bounds check?
@@ -405,6 +394,37 @@ func consumeScalar(
     return nil
   }
 }
+/// Consumes one character whose leading scalar satisfies `p`.
+func consumeCharacterWithLeadingScalar(
+  _ p: @escaping ScalarPredicate
+) -> MEProgram<String>.ConsumeFunction {
+  { input, bounds in
+    // An empty range has no character to test, so fail without reading.
+    guard !bounds.isEmpty, p(input[bounds.lowerBound].unicodeScalars.first!)
+    else { return nil }
+    return input.index(after: bounds.lowerBound)
+  }
+}
+/// Consumes one character made of exactly one scalar that satisfies `p`.
+func consumeCharacterWithSingleScalar(
+  _ p: @escaping ScalarPredicate
+) -> MEProgram<String>.ConsumeFunction {
+  { input, bounds in
+    // An empty range has no character to test, so fail without reading.
+    guard !bounds.isEmpty else { return nil }
+    let char = input[bounds.lowerBound]
+    guard char.hasExactlyOneScalar, p(char.unicodeScalars.first!) else { return nil }
+    return input.index(after: bounds.lowerBound)
+  }
+}
+
+/// Picks the consumer factory for the semantic level in `opts`.
+func consumeFunction(
+  for opts: MatchingOptions
+) -> (@escaping ScalarPredicate) -> MEProgram<String>.ConsumeFunction {
+  let matchesCharacters = opts.semanticLevel == .graphemeCluster
+  return matchesCharacters ? consumeCharacterWithLeadingScalar : consumeScalar
+}
 
 extension AST.Atom.CharacterProperty {
   func generateConsumer(
@@ -416,16 +436,15 @@ extension AST.Atom.CharacterProperty {
     ) -> MEProgram<String>.ConsumeFunction {
       return { input, bounds in
         if p(input, bounds) != nil { return nil }
-        // TODO: semantic level
+
         // TODO: bounds check
-        return input.unicodeScalars.index(
-          after: bounds.lowerBound)
+        return opts.semanticLevel == .graphemeCluster
+          ? input.index(after: bounds.lowerBound)
+          : input.unicodeScalars.index(after: bounds.lowerBound)
       }
     }
 
-    // FIXME: Below is largely scalar based, for convenience,
-    // but we want a comprehensive treatment to semantic mode
-    // switching.
+    let consume = consumeFunction(for: opts)
     let preInversion: MEProgram<String>.ConsumeFunction =
     try {
       switch kind {
@@ -436,11 +455,16 @@ extension AST.Atom.CharacterProperty {
           return input.index(after: bounds.lowerBound)
         }
       case .assigned:
-        return consumeScalar {
+        return consume {
           $0.properties.generalCategory != .unassigned
         }
       case .ascii:
-        return consumeScalar(\.isASCII)
+        // Note: ASCII must look at the whole character, not just the first
+        // scalar. That is, "e\u{301}" is not an ASCII character, even though
+        // the first scalar is.
+        return opts.semanticLevel == .graphemeCluster
+          ? consumeCharacterWithSingleScalar(\.isASCII)
+          : consumeScalar(\.isASCII)
 
       case .generalCategory(let p):
         return try p.generateConsumer(opts)
@@ -451,10 +475,10 @@ extension AST.Atom.CharacterProperty {
         return value ? cons : invert(cons)
 
       case .script(let s):
-        return consumeScalarScript(s)
+        return consume(scriptScalarPredicate(s))
 
       case .scriptExtension(let s):
-        return consumeScalarScriptExtension(s)
+        return consume(scriptExtensionScalarPredicate(s))
 
       case .posix(let p):
         return p.generateConsumer(opts)
@@ -477,49 +501,48 @@ extension Unicode.BinaryProperty {
   func generateConsumer(
     _ opts: MatchingOptions
   ) throws -> MEProgram<String>.ConsumeFunction {
+    let consume = consumeFunction(for: opts)
+    
     switch self {
-
     case .asciiHexDigit:
-      return consumeScalarProp {
+      return consume(propertyScalarPredicate {
         $0.isHexDigit && $0.isASCIIHexDigit
-      }
+      })
     case .alphabetic:
-      return consumeScalarProp(\.isAlphabetic)
+      return consume(propertyScalarPredicate(\.isAlphabetic))
     case .bidiControl:
       break
-
-
-    case .bidiMirrored: 
-      return consumeScalarProp(\.isBidiMirrored)
+    case .bidiMirrored:
+      return consume(propertyScalarPredicate(\.isBidiMirrored))
     case .cased:
-      return consumeScalarProp(\.isCased)
+      return consume(propertyScalarPredicate(\.isCased))
     case .compositionExclusion:
       break
     case .caseIgnorable:
-      return consumeScalarProp(\.isCaseIgnorable)
+      return consume(propertyScalarPredicate(\.isCaseIgnorable))
     case .changesWhenCasefolded:
-      return consumeScalarProp(\.changesWhenCaseFolded)
+      return consume(propertyScalarPredicate(\.changesWhenCaseFolded))
     case .changesWhenCasemapped:
-      return consumeScalarProp(\.changesWhenCaseMapped)
+      return consume(propertyScalarPredicate(\.changesWhenCaseMapped))
     case .changesWhenNFKCCasefolded:
-      return consumeScalarProp(\.changesWhenNFKCCaseFolded)
+      return consume(propertyScalarPredicate(\.changesWhenNFKCCaseFolded))
     case .changesWhenLowercased:
-      return consumeScalarProp(\.changesWhenLowercased)
+      return consume(propertyScalarPredicate(\.changesWhenLowercased))
     case .changesWhenTitlecased:
-      return consumeScalarProp(\.changesWhenTitlecased)
+      return consume(propertyScalarPredicate(\.changesWhenTitlecased))
     case .changesWhenUppercased:
-      return consumeScalarProp(\.changesWhenUppercased)
+      return consume(propertyScalarPredicate(\.changesWhenUppercased))
     case .dash:
-      return consumeScalarProp(\.isDash)
+      return consume(propertyScalarPredicate(\.isDash))
     case .deprecated:
-      return consumeScalarProp(\.isDeprecated)
+      return consume(propertyScalarPredicate(\.isDeprecated))
     case .defaultIgnorableCodePoint:
-      return consumeScalarProp(\.isDefaultIgnorableCodePoint)
+      return consume(propertyScalarPredicate(\.isDefaultIgnorableCodePoint))
     case .diacratic: // spelling?
-      return consumeScalarProp(\.isDiacritic)
+      return consume(propertyScalarPredicate(\.isDiacritic))
     case .emojiModifierBase:
       if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) {
-        return consumeScalarProp(\.isEmojiModifierBase)
+        return consume(propertyScalarPredicate(\.isEmojiModifierBase))
       } else {
         throw Unsupported(
           "isEmojiModifierBase on old OSes")
@@ -528,59 +551,59 @@ extension Unicode.BinaryProperty {
       break
     case .emojiModifier:
       if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) {
-        return consumeScalarProp(\.isEmojiModifier)
+        return consume(propertyScalarPredicate(\.isEmojiModifier))
       } else {
         throw Unsupported("isEmojiModifier on old OSes")
       }
     case .emoji:
       if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) {
-        return consumeScalarProp(\.isEmoji)
+        return consume(propertyScalarPredicate(\.isEmoji))
       } else {
         throw Unsupported("isEmoji on old OSes")
       }
     case .emojiPresentation:
       if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) {
-        return consumeScalarProp(\.isEmojiPresentation)
+        return consume(propertyScalarPredicate(\.isEmojiPresentation))
       } else {
         throw Unsupported(
           "isEmojiPresentation on old OSes")
       }
     case .extender:
-      return consumeScalarProp(\.isExtender)
+      return consume(propertyScalarPredicate(\.isExtender))
     case .extendedPictographic:
       break // NOTE: Stdlib has this data internally
     case .fullCompositionExclusion:
-      return consumeScalarProp(\.isFullCompositionExclusion)
+      return consume(propertyScalarPredicate(\.isFullCompositionExclusion))
     case .graphemeBase:
-      return consumeScalarProp(\.isGraphemeBase)
+      return consume(propertyScalarPredicate(\.isGraphemeBase))
     case .graphemeExtended:
-      return consumeScalarProp(\.isGraphemeExtend)
+      return consume(propertyScalarPredicate(\.isGraphemeExtend))
     case .graphemeLink:
       break
     case .hexDigit:
-      return consumeScalarProp(\.isHexDigit)
+      return consume(propertyScalarPredicate(\.isHexDigit))
     case .hyphen:
       break
     case .idContinue:
-      return consumeScalarProp(\.isIDContinue)
+      return consume(propertyScalarPredicate(\.isIDContinue))
     case .ideographic:
-      return consumeScalarProp(\.isIdeographic)
+      return consume(propertyScalarPredicate(\.isIdeographic))
     case .idStart:
-      return consumeScalarProp(\.isIDStart)
+      return consume(propertyScalarPredicate(\.isIDStart))
     case .idsBinaryOperator:
-      return consumeScalarProp(\.isIDSBinaryOperator)
+      return consume(propertyScalarPredicate(\.isIDSBinaryOperator))
     case .idsTrinaryOperator:
-      return consumeScalarProp(\.isIDSTrinaryOperator)
+      return consume(propertyScalarPredicate(\.isIDSTrinaryOperator))
     case .joinControl:
-      return consumeScalarProp(\.isJoinControl)
+      return consume(propertyScalarPredicate(\.isJoinControl))
     case .logicalOrderException:
-      return consumeScalarProp(\.isLogicalOrderException)
+      return consume(propertyScalarPredicate(\.isLogicalOrderException))
     case .lowercase:
-      return consumeScalarProp(\.isLowercase)
+      return consume(propertyScalarPredicate(\.isLowercase))
     case .math:
-      return consumeScalarProp(\.isMath)
+      return consume(propertyScalarPredicate(\.isMath))
     case .noncharacterCodePoint:
-      return consumeScalarProp(\.isNoncharacterCodePoint)
+      return consume(propertyScalarPredicate(\.isNoncharacterCodePoint))
     case .otherAlphabetic:
       break
     case .otherDefaultIgnorableCodePoint:
@@ -598,37 +621,37 @@ extension Unicode.BinaryProperty {
     case .otherUppercase:
       break
     case .patternSyntax:
-      return consumeScalarProp(\.isPatternSyntax)
+      return consume(propertyScalarPredicate(\.isPatternSyntax))
     case .patternWhitespace:
-      return consumeScalarProp(\.isPatternWhitespace)
+      return consume(propertyScalarPredicate(\.isPatternWhitespace))
     case .prependedConcatenationMark:
       break
     case .quotationMark:
-      return consumeScalarProp(\.isQuotationMark)
+      return consume(propertyScalarPredicate(\.isQuotationMark))
     case .radical:
-      return consumeScalarProp(\.isRadical)
+      return consume(propertyScalarPredicate(\.isRadical))
     case .regionalIndicator:
-      return consumeScalar { s in
+      return consume { s in
         (0x1F1E6...0x1F1FF).contains(s.value)
       }
     case .softDotted:
-      return consumeScalarProp(\.isSoftDotted)
+      return consume(propertyScalarPredicate(\.isSoftDotted))
     case .sentenceTerminal:
-      return consumeScalarProp(\.isSentenceTerminal)
+      return consume(propertyScalarPredicate(\.isSentenceTerminal))
     case .terminalPunctuation:
-      return consumeScalarProp(\.isTerminalPunctuation)
+      return consume(propertyScalarPredicate(\.isTerminalPunctuation))
     case .unifiedIdiograph: // spelling?
-      return consumeScalarProp(\.isUnifiedIdeograph)
+      return consume(propertyScalarPredicate(\.isUnifiedIdeograph))
     case .uppercase:
-      return consumeScalarProp(\.isUppercase)
+      return consume(propertyScalarPredicate(\.isUppercase))
     case .variationSelector:
-      return consumeScalarProp(\.isVariationSelector)
+      return consume(propertyScalarPredicate(\.isVariationSelector))
     case .whitespace:
-      return consumeScalarProp(\.isWhitespace)
+      return consume(propertyScalarPredicate(\.isWhitespace))
     case .xidContinue:
-      return consumeScalarProp(\.isXIDContinue)
+      return consume(propertyScalarPredicate(\.isXIDContinue))
     case .xidStart:
-      return consumeScalarProp(\.isXIDStart)
+      return consume(propertyScalarPredicate(\.isXIDStart))
     case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD,
         .expandsOnNFKC:
       throw Unsupported("Unicode-deprecated: \(self)")
@@ -643,42 +666,44 @@ extension Unicode.POSIXProperty {
   func generateConsumer(
     _ opts: MatchingOptions
   ) -> MEProgram<String>.ConsumeFunction {
-    // FIXME: semantic levels, modes, etc
+    let consume = consumeFunction(for: opts)
+
+    // FIXME: modes, etc
     switch self {
     case .alnum:
-      return consumeScalarProp {
+      return consume(propertyScalarPredicate {
         $0.isAlphabetic || $0.numericType != nil
-      }
+      })
     case .blank:
-      return consumeScalar { s in
+      return consume { s in
         s.properties.generalCategory == .spaceSeparator ||
         s == "\t"
       }
 
     case .graph:
-      return consumeScalarProp { p in
+      return consume(propertyScalarPredicate { p in
         !(
           p.isWhitespace ||
           p.generalCategory == .control ||
           p.generalCategory == .surrogate ||
           p.generalCategory == .unassigned
         )
-      }
+      })
     case .print:
-      return consumeScalarProp { p in
+      return consume(propertyScalarPredicate { p in
         // FIXME: better def
         p.generalCategory != .control
-      }
+      })
     case .word:
-      return consumeScalarProp { p in
+      return consume(propertyScalarPredicate { p in
         // FIXME: better def
         p.isAlphabetic || p.numericType != nil
         || p.isJoinControl
         || p.isDash// marks and connectors...
-      }
+      })
 
     case .xdigit:
-      return consumeScalarProp(\.isHexDigit) // or number
+      return consume(propertyScalarPredicate(\.isHexDigit)) // or number
 
     }
   }
@@ -689,113 +714,115 @@ extension Unicode.ExtendedGeneralCategory {
   func generateConsumer(
     _ opts: MatchingOptions
   ) throws -> MEProgram<String>.ConsumeFunction {
+    let consume = consumeFunction(for: opts)
+
     switch self {
     case .letter:
-      return consumeScalarGCs([
+      return consume(categoriesScalarPredicate([
         .uppercaseLetter, .lowercaseLetter,
         .titlecaseLetter, .modifierLetter,
         .otherLetter
-      ])
+      ]))
 
     case .mark:
-      return consumeScalarGCs([
+      return consume(categoriesScalarPredicate([
         .nonspacingMark, .spacingMark, .enclosingMark
-      ])
+      ]))
 
     case .number:
-      return consumeScalarGCs([
+      return consume(categoriesScalarPredicate([
         .decimalNumber, .letterNumber, .otherNumber
-      ])
+      ]))
 
     case .symbol:
-      return consumeScalarGCs([
+      return consume(categoriesScalarPredicate([
         .mathSymbol, .currencySymbol, .modifierSymbol,
         .otherSymbol
-      ])
+      ]))
 
     case .punctuation:
-      return consumeScalarGCs([
+      return consume(categoriesScalarPredicate([
         .connectorPunctuation, .dashPunctuation,
         .openPunctuation, .closePunctuation,
         .initialPunctuation, .finalPunctuation,
         .otherPunctuation
-      ])
+      ]))
 
     case .separator:
-      return consumeScalarGCs([
+      return consume(categoriesScalarPredicate([
         .spaceSeparator, .lineSeparator, .paragraphSeparator
-      ])
+      ]))
 
     case .other:
-      return consumeScalarGCs([
+      return consume(categoriesScalarPredicate([
         .control, .format, .surrogate, .privateUse, .unassigned
-      ])
+      ]))
 
     case .casedLetter:
-      return consumeScalarGCs([
+      return consume(categoriesScalarPredicate([
         .uppercaseLetter, .lowercaseLetter, .titlecaseLetter
-      ])
+      ]))
 
     case .control:
-      return consumeScalarGC(.control)
+      return consume(categoryScalarPredicate(.control))
     case .format:
-      return consumeScalarGC(.format)
+      return consume(categoryScalarPredicate(.format))
     case .unassigned:
-      return consumeScalarGC(.unassigned)
+      return consume(categoryScalarPredicate(.unassigned))
     case .privateUse:
-      return consumeScalarGC(.privateUse)
+      return consume(categoryScalarPredicate(.privateUse))
     case .surrogate:
-      return consumeScalarGC(.surrogate)
+      return consume(categoryScalarPredicate(.surrogate))
     case .lowercaseLetter:
-      return consumeScalarGC(.lowercaseLetter)
+      return consume(categoryScalarPredicate(.lowercaseLetter))
     case .modifierLetter:
-      return consumeScalarGC(.modifierLetter)
+      return consume(categoryScalarPredicate(.modifierLetter))
     case .otherLetter:
-      return consumeScalarGC(.otherLetter)
+      return consume(categoryScalarPredicate(.otherLetter))
     case .titlecaseLetter:
-      return consumeScalarGC(.titlecaseLetter)
+      return consume(categoryScalarPredicate(.titlecaseLetter))
     case .uppercaseLetter:
-      return consumeScalarGC(.uppercaseLetter)
+      return consume(categoryScalarPredicate(.uppercaseLetter))
     case .spacingMark:
-      return consumeScalarGC(.spacingMark)
+      return consume(categoryScalarPredicate(.spacingMark))
     case .enclosingMark:
-      return consumeScalarGC(.enclosingMark)
+      return consume(categoryScalarPredicate(.enclosingMark))
     case .nonspacingMark:
-      return consumeScalarGC(.nonspacingMark)
+      return consume(categoryScalarPredicate(.nonspacingMark))
     case .decimalNumber:
-      return consumeScalarGC(.decimalNumber)
+      return consume(categoryScalarPredicate(.decimalNumber))
     case .letterNumber:
-      return consumeScalarGC(.letterNumber)
+      return consume(categoryScalarPredicate(.letterNumber))
     case .otherNumber:
-      return consumeScalarGC(.otherNumber)
+      return consume(categoryScalarPredicate(.otherNumber))
     case .connectorPunctuation:
-      return consumeScalarGC(.connectorPunctuation)
+      return consume(categoryScalarPredicate(.connectorPunctuation))
     case .dashPunctuation:
-      return consumeScalarGC(.dashPunctuation)
+      return consume(categoryScalarPredicate(.dashPunctuation))
     case .closePunctuation:
-      return consumeScalarGC(.closePunctuation)
+      return consume(categoryScalarPredicate(.closePunctuation))
     case .finalPunctuation:
-      return consumeScalarGC(.finalPunctuation)
+      return consume(categoryScalarPredicate(.finalPunctuation))
     case .initialPunctuation:
-      return consumeScalarGC(.initialPunctuation)
+      return consume(categoryScalarPredicate(.initialPunctuation))
     case .otherPunctuation:
-      return consumeScalarGC(.otherPunctuation)
+      return consume(categoryScalarPredicate(.otherPunctuation))
     case .openPunctuation:
-      return consumeScalarGC(.openPunctuation)
+      return consume(categoryScalarPredicate(.openPunctuation))
     case .currencySymbol:
-      return consumeScalarGC(.currencySymbol)
+      return consume(categoryScalarPredicate(.currencySymbol))
     case .modifierSymbol:
-      return consumeScalarGC(.modifierSymbol)
+      return consume(categoryScalarPredicate(.modifierSymbol))
     case .mathSymbol:
-      return consumeScalarGC(.mathSymbol)
+      return consume(categoryScalarPredicate(.mathSymbol))
     case .otherSymbol:
-      return consumeScalarGC(.otherSymbol)
+      return consume(categoryScalarPredicate(.otherSymbol))
     case .lineSeparator:
-      return consumeScalarGC(.lineSeparator)
+      return consume(categoryScalarPredicate(.lineSeparator))
     case .paragraphSeparator:
-      return consumeScalarGC(.paragraphSeparator)
+      return consume(categoryScalarPredicate(.paragraphSeparator))
     case .spaceSeparator:
-      return consumeScalarGC(.spaceSeparator)
+      return consume(categoryScalarPredicate(.spaceSeparator))
     }
   }
 }
diff --git a/Sources/_StringProcessing/Unicode/CharacterProps.swift b/Sources/_StringProcessing/Unicode/CharacterProps.swift
index cfa68c425..80f6819a6 100644
--- a/Sources/_StringProcessing/Unicode/CharacterProps.swift
+++ b/Sources/_StringProcessing/Unicode/CharacterProps.swift
@@ -12,3 +12,9 @@
 
 // TODO
 
+extension Character {
+  /// Whether this character is made up of exactly one Unicode scalar value.
+  var hasExactlyOneScalar: Bool {
+    unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex
+  }
+}
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index f999f2fe5..4bf2da106 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -499,7 +499,7 @@ extension RegexTests {
       ("baaabc", nil),
       ("baaaaabc", nil),
       ("baaaaaaaabc", nil),
-      ("bb", nil))
+      xfail: true)
 
     // XFAIL'd possessive tests
     firstMatchTests(
@@ -1454,8 +1454,7 @@ extension RegexTests {
     // \p{Letter}
     firstMatchTest(#"\p{Letter}$"#, input: eComposed, match: eComposed)
     // FIXME: \p{Letter} doesn't match a decomposed character
-    firstMatchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed,
-              xfail: true)
+    firstMatchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed)
     
     // \d
     firstMatchTest(#"\d"#, input: "5", match: "5")
@@ -1560,8 +1559,7 @@ extension RegexTests {
     
     // FIXME: \O is unsupported
     firstMatchTest(#"(?u)\O\u{301}"#, input: eDecomposed, match: eDecomposed)
-    firstMatchTest(#"(?u)e\O"#, input: eDecomposed, match: eDecomposed,
-      xfail: true)
+    firstMatchTest(#"(?u)e\O"#, input: eDecomposed, match: eDecomposed)
     firstMatchTest(#"\O"#, input: eComposed, match: eComposed)
     firstMatchTest(#"\O"#, input: eDecomposed, match: nil,
               xfail: true)
diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift
index d76329670..95f820bc1 100644
--- a/Tests/RegexTests/UTS18Tests.swift
+++ b/Tests/RegexTests/UTS18Tests.swift
@@ -15,9 +15,9 @@ import _StringProcessing
 
 class UTS18Tests: XCTestCase {
   var input: String {
-    "ABCdefghîøü\u{FFF0} -–—[]123"
-  // 012345678901       234567890
-  // 0         10               20
+    "ABCdefghîøu\u{308}\u{FFF0} -–—[]123"
+  // 01234567890       1       234567890
+  // 0         10                      20
   }
 }
 

From 2dec7fd06ef00251159e91b94979fa0d1b9e84e3 Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Thu, 5 May 2022 11:54:39 -0500
Subject: [PATCH 10/13] Fix up an expected failure block

---
 Tests/RegexTests/UTS18Tests.swift | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift
index 95f820bc1..0b20648ad 100644
--- a/Tests/RegexTests/UTS18Tests.swift
+++ b/Tests/RegexTests/UTS18Tests.swift
@@ -492,9 +492,10 @@ extension UTS18Tests {
     // IDS_Binary_Operator
     // IDS_Trinary_Operator
     // Equivalent_Unified_Ideograph
-    XCTExpectFailure()
-    XCTFail(#"Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)"#)
-    // XCTAssertTrue("⼚⺁厂".contains(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#))
+    XCTExpectFailure {
+      XCTFail(#"Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)"#)
+      // XCTAssertTrue("⼚⺁厂".contains(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#))
+    }
 
     // MARK: Case
     // Uppercase

From fb1324bbc7a3b167e0b097701e514271400d23d7 Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Thu, 5 May 2022 14:19:01 -0500
Subject: [PATCH 11/13] Remove regex literals from UTS-18 tests

---
 Package.swift                     |   1 -
 Tests/RegexTests/UTS18Tests.swift | 221 ++++++++++++++++--------------
 2 files changed, 117 insertions(+), 105 deletions(-)

diff --git a/Package.swift b/Package.swift
index 5e90cba7f..8303fc5cb 100644
--- a/Package.swift
+++ b/Package.swift
@@ -67,7 +67,6 @@ let package = Package(
             name: "RegexTests",
             dependencies: ["_StringProcessing"],
             swiftSettings: [
-                .unsafeFlags(["-Xfrontend", "-enable-experimental-string-processing"]),
                 .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]),
             ]),
         .testTarget(
diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift
index 0b20648ad..71f459a1b 100644
--- a/Tests/RegexTests/UTS18Tests.swift
+++ b/Tests/RegexTests/UTS18Tests.swift
@@ -9,6 +9,15 @@
 //
 //===----------------------------------------------------------------------===//
 
+// This test suite includes tests that verify the behavior of `Regex` as it
+// relates to Unicode Technical Standard #18: Unicode Regular Expressions.
+//
+// Please note: Quotations of UTS18 in this file mostly use 'Character' to mean
+// Unicode code point, and 'String' to mean 'sequence of code points' — they
+// are not the Swift meanings of those terms.
+//
+// See https://unicode.org/reports/tr18/ for more.
+
 import XCTest
 @testable // for internal `matches(of:)`
 import _StringProcessing
@@ -21,6 +30,10 @@ class UTS18Tests: XCTestCase {
   }
 }
 
+fileprivate func regex(_ pattern: String) -> Regex<Substring> {
+  try! Regex(pattern, as: Substring.self)
+}
+
 fileprivate extension String {
   subscript<R: RangeExpression>(pos bounds: R) -> Substring
     where R.Bound == Int
@@ -54,9 +67,9 @@ extension UTS18Tests {
   // To meet this requirement, an implementation shall supply a mechanism for
   // specifying any Unicode code point (from U+0000 to U+10FFFF), using the
   // hexadecimal code point representation.
-  func testHexNotation() throws {
-    expectFirstMatch("ab", #/\u{61}\u{62}/#, "ab")
-    expectFirstMatch("𝄞", #/\u{1D11E}/#, "𝄞")
+  func testHexNotation() {
+    expectFirstMatch("ab", regex(#"\u{61}\u{62}"#), "ab")
+    expectFirstMatch("𝄞", regex(#"\u{1D11E}"#), "𝄞")
   }
   
   // 1.1.1 Hex Notation and Normalization
@@ -80,50 +93,50 @@ extension UTS18Tests {
   // Binary, Enumerated, Catalog, and Name values must follow the Matching
   // Rules from [UAX44] with one exception: implementations are not required
   // to ignore an initial prefix string of "is" in property values.
-  func testProperties() throws {
+  func testProperties() {
     // General_Category
-    expectFirstMatch(input, #/\p{Lu}+/#, input[pos: ..<3])
-    expectFirstMatch(input, #/\p{lu}+/#, input[pos: ..<3])
-    expectFirstMatch(input, #/\p{uppercase letter}+/#, input[pos: ..<3])
-    expectFirstMatch(input, #/\p{Uppercase Letter}+/#, input[pos: ..<3])
-    expectFirstMatch(input, #/\p{Uppercase_Letter}+/#, input[pos: ..<3])
-    expectFirstMatch(input, #/\p{uppercaseletter}+/#, input[pos: ..<3])
+    expectFirstMatch(input, regex(#"\p{Lu}+"#), input[pos: ..<3])
+    expectFirstMatch(input, regex(#"\p{lu}+"#), input[pos: ..<3])
+    expectFirstMatch(input, regex(#"\p{uppercase letter}+"#), input[pos: ..<3])
+    expectFirstMatch(input, regex(#"\p{Uppercase Letter}+"#), input[pos: ..<3])
+    expectFirstMatch(input, regex(#"\p{Uppercase_Letter}+"#), input[pos: ..<3])
+    expectFirstMatch(input, regex(#"\p{uppercaseletter}+"#), input[pos: ..<3])
     
-    expectFirstMatch(input, #/\p{P}+/#, "-–—[]")
-    expectFirstMatch(input, #/\p{Pd}+/#, "-–—")
+    expectFirstMatch(input, regex(#"\p{P}+"#), "-–—[]")
+    expectFirstMatch(input, regex(#"\p{Pd}+"#), "-–—")
     
-    expectFirstMatch(input, #/\p{Any}+/#, input[...])
-    expectFirstMatch(input, #/\p{Assigned}+/#, input[pos: ..<11])
-    expectFirstMatch(input, #/\p{ASCII}+/#, input[pos: ..<8])
+    expectFirstMatch(input, regex(#"\p{Any}+"#), input[...])
+    expectFirstMatch(input, regex(#"\p{Assigned}+"#), input[pos: ..<11])
+    expectFirstMatch(input, regex(#"\p{ASCII}+"#), input[pos: ..<8])
     
     // Script and Script_Extensions
     //    U+3042  あ  HIRAGANA LETTER A  Hira  {Hira}
-    XCTAssertTrue("\u{3042}".contains(#/\p{Hira}/#))
-    XCTAssertTrue("\u{3042}".contains(#/\p{sc=Hira}/#))
-    XCTAssertTrue("\u{3042}".contains(#/\p{scx=Hira}/#))
+    XCTAssertTrue("\u{3042}".contains(regex(#"\p{Hira}"#)))
+    XCTAssertTrue("\u{3042}".contains(regex(#"\p{sc=Hira}"#)))
+    XCTAssertTrue("\u{3042}".contains(regex(#"\p{scx=Hira}"#)))
     //    U+30FC  ー  KATAKANA-HIRAGANA PROLONGED SOUND MARK  Zyyy = Common  {Hira, Kana}
-    XCTAssertTrue("\u{30FC}".contains(#/\p{Hira}/#))      // Implicit = Script_Extensions
-    XCTAssertTrue("\u{30FC}".contains(#/\p{Kana}/#))
-    XCTAssertTrue("\u{30FC}".contains(#/\p{sc=Zyyy}/#))   // Explicit = Script
-    XCTAssertTrue("\u{30FC}".contains(#/\p{scx=Hira}/#))
-    XCTAssertTrue("\u{30FC}".contains(#/\p{scx=Kana}/#))
-    XCTAssertFalse("\u{30FC}".contains(#/\p{sc=Hira}/#))
-    XCTAssertFalse("\u{30FC}".contains(#/\p{sc=Kana}/#))
+    XCTAssertTrue("\u{30FC}".contains(regex(#"\p{Hira}"#)))      // Implicit = Script_Extensions
+    XCTAssertTrue("\u{30FC}".contains(regex(#"\p{Kana}"#)))
+    XCTAssertTrue("\u{30FC}".contains(regex(#"\p{sc=Zyyy}"#)))   // Explicit = Script
+    XCTAssertTrue("\u{30FC}".contains(regex(#"\p{scx=Hira}"#)))
+    XCTAssertTrue("\u{30FC}".contains(regex(#"\p{scx=Kana}"#)))
+    XCTAssertFalse("\u{30FC}".contains(regex(#"\p{sc=Hira}"#)))
+    XCTAssertFalse("\u{30FC}".contains(regex(#"\p{sc=Kana}"#)))
     
     // Uppercase, etc
-    expectFirstMatch(input, #/\p{Uppercase}+/#, input[pos: ..<3])
-    expectFirstMatch(input, #/\p{isUppercase}+/#, input[pos: ..<3])
-    expectFirstMatch(input, #/\p{Uppercase=true}+/#, input[pos: ..<3])
-    expectFirstMatch(input, #/\p{is Uppercase}+/#, input[pos: ..<3])
-    expectFirstMatch(input, #/\p{is uppercase = true}+/#, input[pos: ..<3])
-    expectFirstMatch(input, #/\p{lowercase}+/#, input[pos: 3..<11])
-    expectFirstMatch(input, #/\p{whitespace}+/#, input[pos: 12..<13])
+    expectFirstMatch(input, regex(#"\p{Uppercase}+"#), input[pos: ..<3])
+    expectFirstMatch(input, regex(#"\p{isUppercase}+"#), input[pos: ..<3])
+    expectFirstMatch(input, regex(#"\p{Uppercase=true}+"#), input[pos: ..<3])
+    expectFirstMatch(input, regex(#"\p{is Uppercase}+"#), input[pos: ..<3])
+    expectFirstMatch(input, regex(#"\p{is uppercase = true}+"#), input[pos: ..<3])
+    expectFirstMatch(input, regex(#"\p{lowercase}+"#), input[pos: 3..<11])
+    expectFirstMatch(input, regex(#"\p{whitespace}+"#), input[pos: 12..<13])
 
     // Block vs Writing System
     let greekScalar = "Θ" // U+0398
     let greekExtendedScalar = "ἀ" // U+1F00
-    XCTAssertTrue(greekScalar.contains(#/\p{Greek}/#))
-    XCTAssertTrue(greekExtendedScalar.contains(#/\p{Greek}/#))
+    XCTAssertTrue(greekScalar.contains(regex(#"\p{Greek}"#)))
+    XCTAssertTrue(greekExtendedScalar.contains(regex(#"\p{Greek}"#)))
   }
   
   func testProperties_XFail() {
@@ -142,19 +155,19 @@ extension UTS18Tests {
   // the Standard Recommendation or POSIX-compatible properties.
   func testCompatibilityProperties() throws {
     // FIXME: These tests seem insufficient
-    expectFirstMatch(input, #/[[:alpha:]]+/#, input[pos: ..<11])
-    expectFirstMatch(input, #/[[:upper:]]+/#, input[pos: ..<3])
-    expectFirstMatch(input, #/[[:lower:]]+/#, input[pos: 3..<11])
-    expectFirstMatch(input, #/[[:punct:]]+/#, input[pos: 13..<18])
-    expectFirstMatch(input, #/[[:digit:]]+/#, input[pos: 18..<21])
-    expectFirstMatch(input, #/[[:xdigit:]]+/#, input[pos: ..<6])
-    expectFirstMatch(input, #/[[:alnum:]]+/#, input[pos: ..<11])
-    expectFirstMatch(input, #/[[:space:]]+/#, input[pos: 12..<13])
+    expectFirstMatch(input, regex(#"[[:alpha:]]+"#), input[pos: ..<11])
+    expectFirstMatch(input, regex(#"[[:upper:]]+"#), input[pos: ..<3])
+    expectFirstMatch(input, regex(#"[[:lower:]]+"#), input[pos: 3..<11])
+    expectFirstMatch(input, regex(#"[[:punct:]]+"#), input[pos: 13..<18])
+    expectFirstMatch(input, regex(#"[[:digit:]]+"#), input[pos: 18..<21])
+    expectFirstMatch(input, regex(#"[[:xdigit:]]+"#), input[pos: ..<6])
+    expectFirstMatch(input, regex(#"[[:alnum:]]+"#), input[pos: ..<11])
+    expectFirstMatch(input, regex(#"[[:space:]]+"#), input[pos: 12..<13])
     // TODO: blank
     // TODO: cntrl
-    expectFirstMatch(input, #/[[:graph:]]+/#, input[pos: ..<11])
-    expectFirstMatch(input, #/[[:print:]]+/#, input[...])
-    expectFirstMatch(input, #/[[:word:]]+/#, input[pos: ..<11])
+    expectFirstMatch(input, regex(#"[[:graph:]]+"#), input[pos: ..<11])
+    expectFirstMatch(input, regex(#"[[:print:]]+"#), input[...])
+    expectFirstMatch(input, regex(#"[[:word:]]+"#), input[pos: ..<11])
   }
   
   //RL1.3 Subtraction and Intersection
@@ -162,27 +175,27 @@ extension UTS18Tests {
   // To meet this requirement, an implementation shall supply mechanisms for
   // union, intersection and set-difference of sets of characters within
   // regular expression character class expressions.
-  func testSubtractionAndIntersection() {
+  func testSubtractionAndIntersection() throws {
     // Non-ASCII letters
-    expectFirstMatch(input, #/[\p{Letter}--\p{ASCII}]+/#, input[pos: 8..<11])
+    expectFirstMatch(input, regex(#"[\p{Letter}--\p{ASCII}]+"#), input[pos: 8..<11])
     // Digits that aren't 1 or 2
-    expectFirstMatch(input, #/[\p{digit}--[12]]+/#, input[pos: 20..<21])
+    expectFirstMatch(input, regex(#"[\p{digit}--[12]]+"#), input[pos: 20..<21])
     
     // ASCII-only letters
-    expectFirstMatch(input, #/[\p{Letter}&&\p{ASCII}]+/#, input[pos: ..<8])
+    expectFirstMatch(input, regex(#"[\p{Letter}&&\p{ASCII}]+"#), input[pos: ..<8])
     // Digits that are 2 or 3
-    expectFirstMatch(input, #/[\p{digit}&&[23]]+/#, input[pos: 19..<21])
+    expectFirstMatch(input, regex(#"[\p{digit}&&[23]]+"#), input[pos: 19..<21])
     
     // Non-ASCII lowercase + non-lowercase ASCII
-    expectFirstMatch(input, #/[\p{lowercase}~~\p{ascii}]+/#, input[pos: ..<3])
-    XCTAssertTrue("123%&^ABC".contains(#/^[\p{lowercase}~~\p{ascii}]+$/#))
+    expectFirstMatch(input, regex(#"[\p{lowercase}~~\p{ascii}]+"#), input[pos: ..<3])
+    XCTAssertTrue("123%&^ABC".contains(regex(#"^[\p{lowercase}~~\p{ascii}]+$"#)))
   }
   
   func testSubtractionAndIntersectionPrecedence() {
-    expectFirstMatch("ABC123-", #/[[:alnum:]]*-/#, "ABC123-")
-    expectFirstMatch("ABC123-", #/[[:alnum:]--\p{Uppercase}]*-/#, "123-")
+    expectFirstMatch("ABC123-", regex(#"[[:alnum:]]*-"#), "ABC123-")
+    expectFirstMatch("ABC123-", regex(#"[[:alnum:]--\p{Uppercase}]*-"#), "123-")
     // Union binds more closely than difference
-    expectFirstMatch("ABC123-", #/[[:alnum:]--\p{Uppercase}[:digit:]]*-/#, "-")
+    expectFirstMatch("ABC123-", regex(#"[[:alnum:]--\p{Uppercase}[:digit:]]*-"#), "-")
     // TODO: Test for intersection precedence
   }
   
@@ -197,7 +210,7 @@ extension UTS18Tests {
   // - Nonspacing marks are never divided from their base characters, and
   //   otherwise ignored in locating boundaries.
   func testSimpleWordBoundaries() {
-    let simpleWordRegex = #/.+?\b/#.wordBoundaryKind(.unicodeLevel1)
+    let simpleWordRegex = regex(#".+?\b"#).wordBoundaryKind(.unicodeLevel1)
     expectFirstMatch(input, simpleWordRegex, input[pos: ..<11])
     expectFirstMatch("don't", simpleWordRegex, "don")
     expectFirstMatch("Cafe\u{301}", simpleWordRegex, "Café")
@@ -214,17 +227,17 @@ extension UTS18Tests {
   // conversions, then it shall provide at least the simple, default Unicode
   // case folding.
   func testSimpleLooseMatches() {
-    expectFirstMatch("Dåb", #/Dåb/#.ignoresCase(), "Dåb")
-    expectFirstMatch("dÅB", #/Dåb/#.ignoresCase(), "dÅB")
-    expectFirstMatch("D\u{212B}B", #/Dåb/#.ignoresCase(), "D\u{212B}B")
+    expectFirstMatch("Dåb", regex(#"Dåb"#).ignoresCase(), "Dåb")
+    expectFirstMatch("dÅB", regex(#"Dåb"#).ignoresCase(), "dÅB")
+    expectFirstMatch("D\u{212B}B", regex(#"Dåb"#).ignoresCase(), "D\u{212B}B")
   }
 
   func testSimpleLooseMatches_XFail() {
     XCTExpectFailure("Need case folding support") {
       let sigmas = "σΣς"
-      expectFirstMatch(sigmas, #/σ+/#.ignoresCase(), sigmas[...])
-      expectFirstMatch(sigmas, #/Σ+/#.ignoresCase(), sigmas[...])
-      expectFirstMatch(sigmas, #/ς+/#.ignoresCase(), sigmas[...])
+      expectFirstMatch(sigmas, regex(#"σ+"#).ignoresCase(), sigmas[...])
+      expectFirstMatch(sigmas, regex(#"Σ+"#).ignoresCase(), sigmas[...])
+      expectFirstMatch(sigmas, regex(#"ς+"#).ignoresCase(), sigmas[...])
       
       // TODO: Test German sharp S
       // TODO: Test char classes, e.g. [\p{Block=Phonetic_Extensions} [A-E]]
@@ -252,23 +265,23 @@ extension UTS18Tests {
       
       """
     // Check the input counts
-    var lines = lineInput.matches(of: #/\d{2}/#)
+    var lines = lineInput.matches(of: regex(#"\d{2}"#))
     XCTAssertEqual(lines.count, 11)
     // Test \R - newline sequence
-    lines = lineInput.matches(of: #/\d{2}\R/#)
+    lines = lineInput.matches(of: regex(#"\d{2}\R"#))
     XCTAssertEqual(lines.count, 11)
     // Test anchors as line boundaries
-    lines = lineInput.matches(of: #/^\d{2}$/#.anchorsMatchLineEndings())
+    lines = lineInput.matches(of: regex(#"^\d{2}$"#).anchorsMatchLineEndings())
     XCTAssertEqual(lines.count, 11)
     // Test that dot does not match line endings
-    lines = lineInput.matches(of: #/.+/#)
+    lines = lineInput.matches(of: regex(#".+"#))
     XCTAssertEqual(lines.count, 11)
     
     // Does not contain an empty line
-    XCTAssertFalse(lineInput.contains(#/^$/#))
+    XCTAssertFalse(lineInput.contains(regex(#"^$"#)))
     // Does contain an empty line (between \n and \r, which are reversed here)
     let empty = "\n\r"
-    XCTAssertTrue(empty.contains(#/^$/#.anchorsMatchLineEndings()))
+    XCTAssertTrue(empty.contains(regex(#"^$"#).anchorsMatchLineEndings()))
   }
   
   // RL1.7 Supplementary Code Points
@@ -279,9 +292,9 @@ extension UTS18Tests {
   // surrogate followed by a trailing surrogate shall be handled as a single
   // code point in matching.
   func testSupplementaryCodePoints() {
-    XCTAssertTrue("👍".contains(#/\u{1F44D}/#))
-    XCTAssertTrue("👍".contains(#/[\u{1F440}-\u{1F44F}]/#))
-    XCTAssertTrue("👍👎".contains(#/^[\u{1F440}-\u{1F44F}]+$/#))
+    XCTAssertTrue("👍".contains(regex(#"\u{1F44D}"#)))
+    XCTAssertTrue("👍".contains(regex(#"[\u{1F440}-\u{1F44F}]"#)))
+    XCTAssertTrue("👍👎".contains(regex(#"^[\u{1F440}-\u{1F44F}]+$"#)))
   }
 }
 
@@ -304,11 +317,11 @@ extension UTS18Tests {
     ]
     
     let regexes = [
-      #/\u{006f}\u{031b}\u{0323}/#,   // o + horn + dot_below
-      #/\u{006f}\u{0323}\u{031b}/#,   // o + dot_below + horn
-      #/\u{01a1}\u{0323}/#,           // o-horn + dot_below
-      #/\u{1ecd}\u{031b}/#,           // o-dot_below + horn
-      #/\u{1ee3}/#,                   // o-horn-dot_below
+      regex(#"\u{006f}\u{031b}\u{0323}"#),   // o + horn + dot_below
+      regex(#"\u{006f}\u{0323}\u{031b}"#),   // o + dot_below + horn
+      regex(#"\u{01a1}\u{0323}"#),           // o-horn + dot_below
+      regex(#"\u{1ecd}\u{031b}"#),           // o-dot_below + horn
+      regex(#"\u{1ee3}"#),                   // o-horn-dot_below
     ]
 
     // Default: Grapheme cluster semantics
@@ -343,14 +356,14 @@ extension UTS18Tests {
   // matching against an arbitrary extended grapheme cluster, Character Classes
   // with Strings, and extended grapheme cluster boundaries.
   func testExtendedGraphemeClusters() {
-    XCTAssertTrue("abcdef🇬🇭".contains(#/abcdef.$/#))
-    XCTAssertTrue("abcdef🇬🇭".contains(#/abcdef\X$/#))
-    XCTAssertTrue("abcdef🇬🇭".contains(#/abcdef\X$/#.matchingSemantics(.unicodeScalar)))
-    XCTAssertTrue("abcdef🇬🇭".contains(#/abcdef.+\y/#.matchingSemantics(.unicodeScalar)))
+    XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef.$"#)))
+    XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#)))
+    XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#).matchingSemantics(.unicodeScalar)))
+    XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef.+\y"#).matchingSemantics(.unicodeScalar)))
   }
   
   func testCharacterClassesWithStrings() {
-    let regex = #/[a-z🧐🇧🇪🇧🇫🇧🇬]/#
+    let regex = regex(#"[a-z🧐🇧🇪🇧🇫🇧🇬]"#)
     XCTAssertTrue("🧐".contains(regex))
     XCTAssertTrue("🇧🇫".contains(regex))
   }
@@ -399,43 +412,43 @@ extension UTS18Tests {
   }
   
   func testIndividuallyNamedCharacters() {
-    XCTAssertTrue("\u{263A}".contains(#/\N{WHITE SMILING FACE}/#))
-    XCTAssertTrue("\u{3B1}".contains(#/\N{GREEK SMALL LETTER ALPHA}/#))
-    XCTAssertTrue("\u{10450}".contains(#/\N{SHAVIAN LETTER PEEP}/#))
+    XCTAssertTrue("\u{263A}".contains(regex(#"\N{WHITE SMILING FACE}"#)))
+    XCTAssertTrue("\u{3B1}".contains(regex(#"\N{GREEK SMALL LETTER ALPHA}"#)))
+    XCTAssertTrue("\u{10450}".contains(regex(#"\N{SHAVIAN LETTER PEEP}"#)))
     
-    XCTAssertTrue("\u{FEFF}".contains(#/\N{ZERO WIDTH NO-BREAK SPACE}/#))
-    XCTAssertTrue("강".contains(#/\N{HANGUL SYLLABLE GANG}/#))
-    XCTAssertTrue("\u{1F514}".contains(#/\N{BELL}/#))
-    XCTAssertTrue("🐯".contains(#/\N{TIGER FACE}/#))
-    XCTAssertFalse("🐯".contains(#/\N{TIEGR FACE}/#))
+    XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{ZERO WIDTH NO-BREAK SPACE}"#)))
+    XCTAssertTrue("강".contains(regex(#"\N{HANGUL SYLLABLE GANG}"#)))
+    XCTAssertTrue("\u{1F514}".contains(regex(#"\N{BELL}"#)))
+    XCTAssertTrue("🐯".contains(regex(#"\N{TIGER FACE}"#)))
+    XCTAssertFalse("🐯".contains(regex(#"\N{TIEGR FACE}"#)))
 
     // Loose matching
-    XCTAssertTrue("\u{263A}".contains(#/\N{whitesmilingface}/#))
-    XCTAssertTrue("\u{263A}".contains(#/\N{wHiTe_sMiLiNg_fAcE}/#))
-    XCTAssertTrue("\u{263A}".contains(#/\N{White Smiling-Face}/#))
-    XCTAssertTrue("\u{FEFF}".contains(#/\N{zerowidthno breakspace}/#))
+    XCTAssertTrue("\u{263A}".contains(regex(#"\N{whitesmilingface}"#)))
+    XCTAssertTrue("\u{263A}".contains(regex(#"\N{wHiTe_sMiLiNg_fAcE}"#)))
+    XCTAssertTrue("\u{263A}".contains(regex(#"\N{White Smiling-Face}"#)))
+    XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{zerowidthno breakspace}"#)))
 
     // Matching semantic level
-    XCTAssertFalse("👩‍👩‍👧‍👦".contains(#/.\N{ZERO WIDTH JOINER}/#))
-    XCTAssertTrue("👩‍👩‍👧‍👦".contains(#/(?u).\N{ZERO WIDTH JOINER}/#))
+    XCTAssertFalse("👩‍👩‍👧‍👦".contains(regex(#".\N{ZERO WIDTH JOINER}"#)))
+    XCTAssertTrue("👩‍👩‍👧‍👦".contains(regex(#"(?u).\N{ZERO WIDTH JOINER}"#)))
   }
 
   func testIndividuallyNamedCharacters_XFail() {
     XCTExpectFailure("Need to support named chars in custom character classes") {
-      XCTFail("\(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#)")
+      XCTFail("\(regex(#"[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+"#))")
       // XCTAssertTrue("^\u{3B1}\u{3B2}$".contains(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#))
     }
     
     XCTExpectFailure("Other named char failures -- investigate") {
-      XCTAssertTrue("\u{C}".contains(#/\N{FORM FEED}/#))
-      XCTAssertTrue("\u{FEFF}".contains(#/\N{BYTE ORDER MARK}/#))
-      XCTAssertTrue("\u{FEFF}".contains(#/\N{BOM}/#))
-      XCTAssertTrue("\u{7}".contains(#/\N{BEL}/#))
+      XCTAssertTrue("\u{C}".contains(regex(#"\N{FORM FEED}"#)))
+      XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BYTE ORDER MARK}"#)))
+      XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BOM}"#)))
+      XCTAssertTrue("\u{7}".contains(regex(#"\N{BEL}"#)))
     }
     
     XCTExpectFailure("Need to recognize invalid names at compile time") {
       XCTFail("This should be a compilation error, not a match failure:")
-      XCTAssertFalse("abc".contains(#/\N{NOT AN ACTUAL CHARACTER NAME}/#))
+      XCTAssertFalse("abc".contains(regex(#"\N{NOT AN ACTUAL CHARACTER NAME}"#)))
     }
   }
 
@@ -509,9 +522,9 @@ extension UTS18Tests {
     // Case_Ignorable
     // Changes_When_Lowercased
     // Changes_When_Uppercased
-    XCTAssertTrue("a".contains(#/\p{Changes_When_Uppercased}/#))
-    XCTAssertTrue("a".contains(#/\p{Changes_When_Uppercased=true}/#))
-    XCTAssertFalse("A".contains(#/\p{Changes_When_Uppercased}/#))
+    XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased}"#)))
+    XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased=true}"#)))
+    XCTAssertFalse("A".contains(regex(#"\p{Changes_When_Uppercased}"#)))
     // Changes_When_Titlecased
     // Changes_When_Casefolded
     // Changes_When_Casemapped

From bb60e493a42a7a3353d770b2ce3bb284601e6014 Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Thu, 5 May 2022 15:02:11 -0500
Subject: [PATCH 12/13] Fix name string loose equality check

---
 .../_StringProcessing/ConsumerInterface.swift | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
index 7331897f1..d27b89314 100644
--- a/Sources/_StringProcessing/ConsumerInterface.swift
+++ b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -120,27 +120,33 @@ extension String {
   /// FIXME: Special case for U+1180 HANGUL JUNGSEONG O-E
   /// See https://www.unicode.org/reports/tr44/#Matching_Rules
   fileprivate func isEqualByUAX44LM2(to other: String) -> Bool {
-    var i = startIndex
-    var j = other.startIndex
+    // Loose matching skips whitespace, hyphens, and underscores.
+    func isIgnorable(_ c: Character) -> Bool {
+      c.isWhitespace || c == "-" || c == "_"
+    }
+
+    var index = startIndex
+    var otherIndex = other.startIndex
     
-    while i < endIndex {
-      if self[i].isWhitespace || self[i] == "-" || self[i] == "_" {
-        formIndex(after: &i)
-        continue
-      }
-      if other[j].isWhitespace || other[j] == "-" || other[j] == "_" {
-        other.formIndex(after: &j)
-        continue
-      }
+    while true {
+      // Skip ignorable characters on each side independently so that
+      // trailing ones are consumed even after the other string has ended.
+      while index < endIndex && isIgnorable(self[index]) {
+        formIndex(after: &index)
+      }
+      while otherIndex < other.endIndex && isIgnorable(other[otherIndex]) {
+        other.formIndex(after: &otherIndex)
+      }
+      guard index < endIndex && otherIndex < other.endIndex else { break }
       
-      if self[i] != other[j] && self[i].lowercased() != other[j].lowercased() {
+      if self[index] != other[otherIndex] && self[index].lowercased() != other[otherIndex].lowercased() {
         return false
       }
 
-      formIndex(after: &i)
-      other.formIndex(after: &j)
+      formIndex(after: &index)
+      other.formIndex(after: &otherIndex)
     }
-    return i == endIndex && j == other.endIndex
+    return index == endIndex && otherIndex == other.endIndex
   }
 }
 

From c3c4621ca6352f8ec87f75915f979043fefe4f7f Mon Sep 17 00:00:00 2001
From: Nate Cook <natecook@apple.com>
Date: Thu, 5 May 2022 15:17:50 -0500
Subject: [PATCH 13/13] Revert scalar-by-scalar matching

---
 Sources/_StringProcessing/ByteCodeGen.swift | 9 ---------
 Tests/RegexTests/MatchTests.swift           | 1 -
 2 files changed, 10 deletions(-)

diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
index dcf543f94..2131d1eb5 100644
--- a/Sources/_StringProcessing/ByteCodeGen.swift
+++ b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -187,16 +187,7 @@ extension Compiler.ByteCodeGen {
           : nil
       }
     } else {
-      let done = builder.makeAddress()
-      let next = builder.makeAddress()
-      builder.buildSave(next)
-      for scalar in c.unicodeScalars {
-        try emitScalar(scalar)
-      }
-      builder.buildBranch(to: done)
-      builder.label(next)
       builder.buildMatch(c)
-      builder.label(done)      
     }
   }
 
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index 4bf2da106..83b73fe35 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -1453,7 +1453,6 @@ extension RegexTests {
       (eDecomposed, true))
     // \p{Letter}
     firstMatchTest(#"\p{Letter}$"#, input: eComposed, match: eComposed)
-    // FIXME: \p{Letter} doesn't match a decomposed character
     firstMatchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed)
     
     // \d