From 8e3d9ac6b64dd78697eb6cb0fd8a2bf0acef26ce Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 12 Apr 2022 07:53:27 -0500 Subject: [PATCH 01/13] Add tests for UTS18 level support --- Package.swift | 4 +- Tests/RegexTests/UTS18Tests.swift | 404 ++++++++++++++++++++++++++++++ 2 files changed, 407 insertions(+), 1 deletion(-) create mode 100644 Tests/RegexTests/UTS18Tests.swift diff --git a/Package.swift b/Package.swift index 47a73ca72..2728b0090 100644 --- a/Package.swift +++ b/Package.swift @@ -55,7 +55,9 @@ let package = Package( ]), .testTarget( name: "RegexTests", - dependencies: ["_StringProcessing"]), + dependencies: ["_StringProcessing"], + swiftSettings: [.unsafeFlags(["-Xfrontend", "-enable-experimental-string-processing"])] + ), .testTarget( name: "RegexBuilderTests", dependencies: ["_StringProcessing", "RegexBuilder"], diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift new file mode 100644 index 000000000..97ba156c4 --- /dev/null +++ b/Tests/RegexTests/UTS18Tests.swift @@ -0,0 +1,404 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. 
and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import XCTest +@testable import _StringProcessing + +class UTS18Tests: XCTestCase { + var input: String { + "ABCdefghîøü\u{FFF0} -–—[]123" + // 012345678901 234567890 + } + +} + +fileprivate extension String { + subscript<R: RangeExpression>(pos bounds: R) -> Substring + where R.Bound == Int + { + let bounds = bounds.relative(to: 0..<count) + return dropFirst(bounds.lowerBound).prefix(bounds.count) + } +} + +fileprivate func expectFirstMatch<Output: Equatable>( + _ input: String, + _ r: Regex<Output>, + _ output: Output, + file: StaticString = #file, + line: UInt = #line) +{ + XCTAssertEqual(input.firstMatch(of: r)?.output, output, file: file, line: line) +} + +#if os(Linux) +func XCTExpectFailure(_ message: String? = nil, body: () -> Void) {} +#endif + +// MARK: - Basic Unicode Support: Level 1 + +// C1. An implementation claiming conformance to Level 1 of this specification +// shall meet the requirements described in the following sections: +extension UTS18Tests { + // RL1.1 Hex Notation + // + // To meet this requirement, an implementation shall supply a mechanism for + // specifying any Unicode code point (from U+0000 to U+10FFFF), using the + // hexadecimal code point representation. + func testHexNotation() throws { + expectFirstMatch("ab", #/\u{61}\u{62}/#, "ab") + expectFirstMatch("𝄞", #/\u{1D11E}/#, "𝄞") + } + + // 1.1.1 Hex Notation and Normalization + // + // TODO: Does this section make a recommendation? 
+ + // RL1.2 Properties + // To meet this requirement, an implementation shall provide at least a + // minimal list of properties, consisting of the following: + // - General_Category + // - Script and Script_Extensions + // - Alphabetic + // - Uppercase + // - Lowercase + // - White_Space + // - Noncharacter_Code_Point + // - Default_Ignorable_Code_Point + // - ANY, ASCII, ASSIGNED + // The values for these properties must follow the Unicode definitions, and + // include the property and property value aliases from the UCD. Matching of + // Binary, Enumerated, Catalog, and Name values must follow the Matching + // Rules from [UAX44] with one exception: implementations are not required + // to ignore an initial prefix string of "is" in property values. + func testProperties() throws { + // General_Category + expectFirstMatch(input, #/\p{Lu}+/#, input[pos: ..<3]) + expectFirstMatch(input, #/\p{lu}+/#, input[pos: ..<3]) + expectFirstMatch(input, #/\p{uppercase letter}+/#, input[pos: ..<3]) + expectFirstMatch(input, #/\p{Uppercase Letter}+/#, input[pos: ..<3]) + expectFirstMatch(input, #/\p{Uppercase_Letter}+/#, input[pos: ..<3]) + expectFirstMatch(input, #/\p{uppercaseletter}+/#, input[pos: ..<3]) + + expectFirstMatch(input, #/\p{P}+/#, "-–—[]") + expectFirstMatch(input, #/\p{Pd}+/#, "-–—") + + expectFirstMatch(input, #/\p{Any}+/#, input[...]) + expectFirstMatch(input, #/\p{Assigned}+/#, input[pos: ..<11]) + expectFirstMatch(input, #/\p{ASCII}+/#, input[pos: ..<8]) + + // Script and Script_Extensions + // U+3042 あ HIRAGANA LETTER A Hira {Hira} + XCTAssertTrue("\u{3042}".contains(#/\p{Hira}/#)) + XCTAssertTrue("\u{3042}".contains(#/\p{sc=Hira}/#)) + XCTAssertTrue("\u{3042}".contains(#/\p{scx=Hira}/#)) + // U+30FC ー KATAKANA-HIRAGANA PROLONGED SOUND MARK Zyyy = Common {Hira, Kana} + XCTAssertTrue("\u{30FC}".contains(#/\p{Hira}/#)) // Implicit = Script_Extensions + XCTAssertTrue("\u{30FC}".contains(#/\p{Kana}/#)) + XCTAssertTrue("\u{30FC}".contains(#/\p{sc=Zyyy}/#)) // 
Explicit = Script + XCTAssertTrue("\u{30FC}".contains(#/\p{scx=Hira}/#)) + XCTAssertTrue("\u{30FC}".contains(#/\p{scx=Kana}/#)) + XCTAssertFalse("\u{30FC}".contains(#/\p{sc=Hira}/#)) + XCTAssertFalse("\u{30FC}".contains(#/\p{sc=Kana}/#)) + + // Uppercase, etc + expectFirstMatch(input, #/\p{Uppercase}+/#, input[pos: ..<3]) + expectFirstMatch(input, #/\p{isUppercase}+/#, input[pos: ..<3]) + expectFirstMatch(input, #/\p{Uppercase=true}+/#, input[pos: ..<3]) + expectFirstMatch(input, #/\p{is Uppercase}+/#, input[pos: ..<3]) + expectFirstMatch(input, #/\p{is uppercase = true}+/#, input[pos: ..<3]) + expectFirstMatch(input, #/\p{lowercase}+/#, input[pos: 3..<11]) + expectFirstMatch(input, #/\p{whitespace}+/#, input[pos: 12..<13]) + + // Block vs Writing System + let greekScalar = "Θ" // U+0398 + let greekExtendedScalar = "ἀ" // U+1F00 + XCTAssertTrue(greekScalar.contains(#/\p{Greek}/#)) + XCTAssertTrue(greekExtendedScalar.contains(#/\p{Greek}/#)) + } + + func testProperties_XFail() { + XCTExpectFailure("Need to support 'age' and 'block' properties") { + // XCTAssertFalse("z".contains(#/\p{age=3.1}/#)) + XCTFail("\(#/\p{age=3.1}/#)") + // XCTAssertTrue("\u{1F00}".contains(#/\p{Block=Greek}/#)) + XCTFail("\(#/\p{Block=Greek}/#)") + } + } + + // RL1.2a Compatibility Properties + // To meet this requirement, an implementation shall provide the properties + // listed in Annex C: Compatibility Properties, with the property values as + // listed there. Such an implementation shall document whether it is using + // the Standard Recommendation or POSIX-compatible properties. 
+ func testCompatibilityProperties() throws { + // FIXME: These tests seem insufficient + expectFirstMatch(input, #/[[:alpha:]]+/#, input[pos: ..<11]) + expectFirstMatch(input, #/[[:upper:]]+/#, input[pos: ..<3]) + expectFirstMatch(input, #/[[:lower:]]+/#, input[pos: 3..<11]) + expectFirstMatch(input, #/[[:punct:]]+/#, input[pos: 13..<18]) + expectFirstMatch(input, #/[[:digit:]]+/#, input[pos: 18..<21]) + expectFirstMatch(input, #/[[:xdigit:]]+/#, input[pos: ..<6]) + expectFirstMatch(input, #/[[:alnum:]]+/#, input[pos: ..<11]) + expectFirstMatch(input, #/[[:space:]]+/#, input[pos: 12..<13]) + // TODO: blank + // TODO: cntrl + expectFirstMatch(input, #/[[:graph:]]+/#, input[pos: ..<11]) + expectFirstMatch(input, #/[[:print:]]+/#, input[...]) + expectFirstMatch(input, #/[[:word:]]+/#, input[pos: ..<11]) + } + + //RL1.3 Subtraction and Intersection + // + // To meet this requirement, an implementation shall supply mechanisms for + // union, intersection and set-difference of sets of characters within + // regular expression character class expressions. 
+ func testSubtractionAndIntersection() { + // Non-ASCII letters + expectFirstMatch(input, #/[\p{Letter}--\p{ASCII}]+/#, input[pos: 8..<11]) + // Digits that aren't 1 or 2 + expectFirstMatch(input, #/[\p{digit}--[12]]+/#, input[pos: 20..<21]) + + // ASCII-only letters + expectFirstMatch(input, #/[\p{Letter}&&\p{ASCII}]+/#, input[pos: ..<8]) + // Digits that are 2 or 3 + expectFirstMatch(input, #/[\p{digit}&&[23]]+/#, input[pos: 19..<21]) + + // Non-ASCII lowercase + non-lowercase ASCII + expectFirstMatch(input, #/[\p{lowercase}~~\p{ascii}]+/#, input[pos: ..<3]) + XCTAssertTrue("123%&^ABC".contains(#/^[\p{lowercase}~~\p{ascii}]+$/#)) + } + + func testSubtractionAndIntersectionPrecedence() { + expectFirstMatch("ABC123-", #/[[:alnum:]]*-/#, "ABC123-") + expectFirstMatch("ABC123-", #/[[:alnum:]--\p{Uppercase}]*-/#, "123-") + // Union binds more closely than difference + expectFirstMatch("ABC123-", #/[[:alnum:]--\p{Uppercase}[:digit:]]*-/#, "-") + // TODO: Test for intersection precedence + } + + // RL1.4 Simple Word Boundaries + // To meet this requirement, an implementation shall extend the word boundary + // mechanism so that: + // - The class of <word_character> includes all the Alphabetic values from the + // Unicode character database, from UnicodeData.txt, plus the decimals + // (General_Category=Decimal_Number, or equivalently Numeric_Type=Decimal), + // and the U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER + // (Join_Control=True). See also Annex C: Compatibility Properties. + // - Nonspacing marks are never divided from their base characters, and + // otherwise ignored in locating boundaries. 
+ func testSimpleWordBoundaries() { + let simpleWordRegex = #/.+?\b/#.usingUnicodeWordBoundaries(false) + expectFirstMatch(input, simpleWordRegex, input[pos: ..<11]) + expectFirstMatch("don't", simpleWordRegex, "don") + expectFirstMatch("Cafe\u{301}", simpleWordRegex, "Café") + } + + // RL1.5 Simple Loose Matches + // + // To meet this requirement, if an implementation provides for case- + // insensitive matching, then it shall provide at least the simple, default + // Unicode case-insensitive matching, and specify which properties are closed + // and which are not. + // + // To meet this requirement, if an implementation provides for case + // conversions, then it shall provide at least the simple, default Unicode + // case folding. + func testSimpleLooseMatches() { + expectFirstMatch("Dåb", #/Dåb/#.ignoringCase(), "Dåb") + expectFirstMatch("dÅB", #/Dåb/#.ignoringCase(), "dÅB") + expectFirstMatch("D\u{212B}B", #/Dåb/#.ignoringCase(), "D\u{212B}B") + } + + func testSimpleLooseMatches_XFail() { + XCTExpectFailure("Need case folding support") { + let sigmas = "σΣς" + expectFirstMatch(sigmas, #/σ+/#.ignoringCase(), sigmas[...]) + expectFirstMatch(sigmas, #/Σ+/#.ignoringCase(), sigmas[...]) + expectFirstMatch(sigmas, #/ς+/#.ignoringCase(), sigmas[...]) + + // TODO: Test German sharp S + // TODO: Test char classes, e.g. [\p{Block=Phonetic_Extensions} [A-E]] + } + } + + // RL1.6 Line Boundaries + // + // To meet this requirement, if an implementation provides for line-boundary + // testing, it shall recognize not only CRLF, LF, CR, but also NEL (U+0085), + // PARAGRAPH SEPARATOR (U+2029) and LINE SEPARATOR (U+2028). 
+ func testLineBoundaries() { + let lineInput = """ + 01 + 02\r\ + 03\n\ + 04\u{a}\ + 05\u{b}\ + 06\u{c}\ + 07\u{d}\ + 08\u{d}\u{a}\ + 09\u{85}\ + 10\u{2028}\ + 11\u{2029}\ + + """ + // Check the input counts + var lines = lineInput.matches(of: #/\d{2}/#) + XCTAssertEqual(lines.count, 11) + // Test \R - newline sequence + lines = lineInput.matches(of: #/\d{2}\R/#) + XCTAssertEqual(lines.count, 11) + // Test anchors as line boundaries + lines = lineInput.matches(of: #/^\d{2}$/#.anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 11) + // Test that dot does not match line endings + lines = lineInput.matches(of: #/.+/#) + XCTAssertEqual(lines.count, 11) + + // Does not contain an empty line + XCTAssertFalse(lineInput.contains(#/^$/#)) + // Does contain an empty line (between \n and \r, which are reversed here) + let empty = "\n\r" + XCTAssertTrue(empty.contains(#/^$/#.anchorsMatchLineEndings())) + } + + // RL1.7 Supplementary Code Points + // + // To meet this requirement, an implementation shall handle the full range of + // Unicode code points, including values from U+FFFF to U+10FFFF. In + // particular, where UTF-16 is used, a sequence consisting of a leading + // surrogate followed by a trailing surrogate shall be handled as a single + // code point in matching. + func testSupplementaryCodePoints() { + XCTAssertTrue("👍".contains(#/\u{1F44D}/#)) + XCTAssertTrue("👍".contains(#/[\u{1F440}-\u{1F44F}]/#)) + XCTAssertTrue("👍👎".contains(#/^[\u{1F440}-\u{1F44F}]+$/#)) + } +} + +// MARK: - Extended Unicode Support: Level 2 + +// C2. An implementation claiming conformance to Level 2 of this specification +// shall satisfy C1, and meet the requirements described in the following +// sections: +extension UTS18Tests { + // RL2.1 Canonical Equivalents + // + // Specific recommendation? 
+ func testCanonicalEquivalents() { + XCTExpectFailure { XCTFail("Implement tests") } + } + + // RL2.2 Extended Grapheme Clusters and Character Classes with Strings + // + // To meet this requirement, an implementation shall provide a mechanism for + // matching against an arbitrary extended grapheme cluster, Character Classes + // with Strings, and extended grapheme cluster boundaries. + func testExtendedGraphemeClusters() { + XCTExpectFailure { XCTFail("Implement tests") } + } + + func testCharacterClassesWithStrings() { + XCTExpectFailure { XCTFail("Implement tests") } + } + + // RL2.3 Default Word Boundaries + // + // To meet this requirement, an implementation shall provide a mechanism for + // matching Unicode default word boundaries. + func testDefaultWordBoundaries() { + XCTExpectFailure { XCTFail("Implement tests") } + } + + // RL2.4 Default Case Conversion + // + // To meet this requirement, if an implementation provides for case + // conversions, then it shall provide at least the full, default Unicode case + // folding. + func testDefaultCaseConversion() { + XCTExpectFailure { XCTFail("Implement tests") } + } + + // RL2.5 Name Properties + // + // To meet this requirement, an implementation shall support individually + // named characters. 
+ func testNameProperty_XFail() { + XCTExpectFailure("Need \\p{name=...} support") { + XCTFail("\(#/\p{name=BOM}/#)") + // Name property + // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=ZERO WIDTH NO-BREAK SPACE}/#)) + // Name property and Matching Rules + // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=zerowidthno breakspace}/#)) + // Name_Alias property + // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=BYTE ORDER MARK}/#)) + // Name_Alias property (again) + // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=BOM}/#)) + + // Computed name + // XCTAssertTrue("강".contains(#/\p{name=HANGUL SYLLABLE GANG}/#)) + + // Control character + // XCTAssertTrue("\u{7}".contains(#/\p{name=BEL}/#)) + // Graphic symbol + // XCTAssertTrue("\u{1F514}".contains(#/\p{name=BELL}/#)) + } + } + + func testIndividuallyNamedCharacters() { + XCTAssertTrue("\u{263A}".contains(#/\N{WHITE SMILING FACE}/#)) + XCTAssertTrue("\u{3B1}".contains(#/\N{GREEK SMALL LETTER ALPHA}/#)) + XCTAssertTrue("\u{10450}".contains(#/\N{SHAVIAN LETTER PEEP}/#)) + + XCTAssertTrue("\u{FEFF}".contains(#/\N{ZERO WIDTH NO-BREAK SPACE}/#)) + XCTAssertTrue("강".contains(#/\N{HANGUL SYLLABLE GANG}/#)) + XCTAssertTrue("\u{1F514}".contains(#/\N{BELL}/#)) + } + + func testIndividuallyNamedCharacters_XFail() { + XCTExpectFailure("Need to support named chars in custom character classes") { + XCTFail("\(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#)") + // XCTAssertTrue("^\u{3B1}\u{3B2}$".contains(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#)) + } + + XCTExpectFailure("Other named char failures -- investigate") { + XCTAssertTrue("\u{263A}".contains(#/\N{whitesmilingface}/#)) + XCTAssertTrue("\u{C}".contains(#/\N{FORM FEED}/#)) + XCTAssertTrue("\u{FEFF}".contains(#/\N{zerowidthno breakspace}/#)) + XCTAssertTrue("\u{FEFF}".contains(#/\N{BYTE ORDER MARK}/#)) + XCTAssertTrue("\u{FEFF}".contains(#/\N{BOM}/#)) + XCTAssertTrue("\u{7}".contains(#/\N{BEL}/#)) + } + + XCTExpectFailure("Need to recognize 
invalid names at compile time") { + XCTFail("This should be a compilation error, not a match failure:") + XCTAssertFalse("abc".contains(#/\N{NOT AN ACTUAL CHARACTER NAME}/#)) + } + } + + // RL2.6 Wildcards in Property Values + // + // To meet this requirement, an implementation shall support wildcards in + // Unicode property values. + func testWildcardsInPropertyValues() { + XCTExpectFailure { XCTFail("Implement tests") } + } + + // RL2.7 Full Properties + // + // To meet this requirement, an implementation shall support all of the + // properties listed below that are in the supported version of the Unicode + // Standard (or Unicode Technical Standard, respectively), with values that + // match the Unicode definitions for that version. + func testFullProperties() { + XCTExpectFailure { XCTFail("Implement tests") } + } +} From d4744adcf8df62320a70e27cb89a514bd21be942 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 14 Apr 2022 06:29:32 -0500 Subject: [PATCH 02/13] Additional UTS18 tests --- Tests/RegexTests/UTS18Tests.swift | 124 +++++++++++++++++++++++++++++- 1 file changed, 121 insertions(+), 3 deletions(-) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 97ba156c4..4fc53d83f 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -10,14 +10,14 @@ //===----------------------------------------------------------------------===// import XCTest -@testable import _StringProcessing +@testable // for internal `matches(of:)` +import _StringProcessing class UTS18Tests: XCTestCase { var input: String { "ABCdefghîøü\u{FFF0} -–—[]123" // 012345678901 234567890 } - } fileprivate extension String { @@ -361,6 +361,8 @@ extension UTS18Tests { XCTAssertTrue("\u{FEFF}".contains(#/\N{ZERO WIDTH NO-BREAK SPACE}/#)) XCTAssertTrue("강".contains(#/\N{HANGUL SYLLABLE GANG}/#)) XCTAssertTrue("\u{1F514}".contains(#/\N{BELL}/#)) + XCTAssertTrue("🐯".contains(#/\N{TIGER FACE}/#)) + 
XCTAssertFalse("🐯".contains(#/\N{TIEGR FACE}/#)) } func testIndividuallyNamedCharacters_XFail() { @@ -399,6 +401,122 @@ extension UTS18Tests { // Standard (or Unicode Technical Standard, respectively), with values that // match the Unicode definitions for that version. func testFullProperties() { - XCTExpectFailure { XCTFail("Implement tests") } + // MARK: General + // Name (Name_Alias) + // Block + // Age + // General_Category + // Script (Script_Extensions) + // White_Space + // Alphabetic + // Hangul_Syllable_Type + // Noncharacter_Code_Point + // Default_Ignorable_Code_Point + // Deprecated + // Logical_Order_Exception + // Variation_Selector + + // MARK: Numeric + // Numeric_Value + // Numeric_Type + // Hex_Digit + // ASCII_Hex_Digit + + // MARK: Identifiers + // ID_Continue + // ID_Start + // XID_Continue + // XID_Start + // Pattern_Syntax + // Pattern_White_Space + // Identifier_Status + // Identifier_Type + + // MARK: CJK + // Ideographic + // Unified_Ideograph + // Radical + // IDS_Binary_Operator + // IDS_Trinary_Operator + // Equivalent_Unified_Ideograph + XCTExpectFailure() + XCTFail("Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)") + // XCTAssertTrue("⼚⺁厂".contains(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)) + + // MARK: Case + // Uppercase + // Lowercase + // Simple_Lowercase_Mapping + // Simple_Titlecase_Mapping + // Simple_Uppercase_Mapping + // Simple_Case_Folding + // Soft_Dotted + // Cased + // Case_Ignorable + // Changes_When_Lowercased + // Changes_When_Uppercased + XCTAssertTrue("a".contains(#/\p{Changes_When_Uppercased}/#)) + XCTAssertTrue("a".contains(#/\p{Changes_When_Uppercased=true}/#)) + XCTAssertFalse("A".contains(#/\p{Changes_When_Uppercased}/#)) + // Changes_When_Titlecased + // Changes_When_Casefolded + // Changes_When_Casemapped + + // MARK: Normalization + // Canonical_Combining_Class + // Decomposition_Type + // NFC_Quick_Check + // NFKC_Quick_Check + // NFD_Quick_Check + // NFKD_Quick_Check + // NFKC_Casefold + // 
Changes_When_NFKC_Casefolded + + // MARK: Emoji + // Emoji + // Emoji_Presentation + // Emoji_Modifier + // Emoji_Modifier_Base + // Emoji_Component + // Extended_Pictographic + // Basic_Emoji* + // Emoji_Keycap_Sequence* + // RGI_Emoji_Modifier_Sequence* + // RGI_Emoji_Flag_Sequence* + // RGI_Emoji_Tag_Sequence* + // RGI_Emoji_ZWJ_Sequence* + // RGI_Emoji* + + // MARK: Shaping and Rendering + // Join_Control + // Joining_Group + // Joining_Type + // Vertical_Orientation + // Line_Break + // Grapheme_Cluster_Break + // Sentence_Break + // Word_Break + // East_Asian_Width + // Prepended_Concatenation_Mark + + // MARK: Bidirectional + // Bidi_Class + // Bidi_Control + // Bidi_Mirrored + // Bidi_Mirroring_Glyph + // Bidi_Paired_Bracket + // Bidi_Paired_Bracket_Type + + // MARK: Miscellaneous + // Math + // Quotation_Mark + // Dash + // Sentence_Terminal + // Terminal_Punctuation + // Diacritic + // Extender + // Grapheme_Base + // Grapheme_Extend + // Regional_Indicator } } From 9a2d6237c3aac41aa84544c477b596620e95121e Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 19 Apr 2022 12:50:16 -0500 Subject: [PATCH 03/13] Implement canonical equivalence tests --- Tests/RegexTests/UTS18Tests.swift | 64 +++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 12 deletions(-) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 4fc53d83f..de13579b2 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -17,6 +17,7 @@ class UTS18Tests: XCTestCase { var input: String { "ABCdefghîøü\u{FFF0} -–—[]123" // 012345678901 234567890 + // 0 10 20 } } @@ -128,9 +129,9 @@ extension UTS18Tests { func testProperties_XFail() { XCTExpectFailure("Need to support 'age' and 'block' properties") { // XCTAssertFalse("z".contains(#/\p{age=3.1}/#)) - XCTFail("\(#/\p{age=3.1}/#)") + XCTFail(#"\(#/\p{age=3.1}/#)"#) // XCTAssertTrue("\u{1F00}".contains(#/\p{Block=Greek}/#)) - XCTFail("\(#/\p{Block=Greek}/#)") + 
XCTFail(#"\(#/\p{Block=Greek}/#)"#) } } @@ -196,7 +197,7 @@ extension UTS18Tests { // - Nonspacing marks are never divided from their base characters, and // otherwise ignored in locating boundaries. func testSimpleWordBoundaries() { - let simpleWordRegex = #/.+?\b/#.usingUnicodeWordBoundaries(false) + let simpleWordRegex = #/.+?\b/#.wordBoundaryKind(.unicodeLevel1) expectFirstMatch(input, simpleWordRegex, input[pos: ..<11]) expectFirstMatch("don't", simpleWordRegex, "don") expectFirstMatch("Cafe\u{301}", simpleWordRegex, "Café") @@ -213,17 +214,17 @@ extension UTS18Tests { // conversions, then it shall provide at least the simple, default Unicode // case folding. func testSimpleLooseMatches() { - expectFirstMatch("Dåb", #/Dåb/#.ignoringCase(), "Dåb") - expectFirstMatch("dÅB", #/Dåb/#.ignoringCase(), "dÅB") - expectFirstMatch("D\u{212B}B", #/Dåb/#.ignoringCase(), "D\u{212B}B") + expectFirstMatch("Dåb", #/Dåb/#.ignoresCase(), "Dåb") + expectFirstMatch("dÅB", #/Dåb/#.ignoresCase(), "dÅB") + expectFirstMatch("D\u{212B}B", #/Dåb/#.ignoresCase(), "D\u{212B}B") } func testSimpleLooseMatches_XFail() { XCTExpectFailure("Need case folding support") { let sigmas = "σΣς" - expectFirstMatch(sigmas, #/σ+/#.ignoringCase(), sigmas[...]) - expectFirstMatch(sigmas, #/Σ+/#.ignoringCase(), sigmas[...]) - expectFirstMatch(sigmas, #/ς+/#.ignoringCase(), sigmas[...]) + expectFirstMatch(sigmas, #/σ+/#.ignoresCase(), sigmas[...]) + expectFirstMatch(sigmas, #/Σ+/#.ignoresCase(), sigmas[...]) + expectFirstMatch(sigmas, #/ς+/#.ignoresCase(), sigmas[...]) // TODO: Test German sharp S // TODO: Test char classes, e.g. [\p{Block=Phonetic_Extensions} [A-E]] @@ -294,7 +295,46 @@ extension UTS18Tests { // // Specific recommendation? 
func testCanonicalEquivalents() { - XCTExpectFailure { XCTFail("Implement tests") } + let equivalents = [ + "\u{006f}\u{031b}\u{0323}", // o + horn + dot_below + "\u{006f}\u{0323}\u{031b}", // o + dot_below + horn + "\u{01a1}\u{0323}", // o-horn + dot_below + "\u{1ecd}\u{031b}", // o-dot_below + horn + "\u{1ee3}", // o-horn-dot_below + ] + + let regexes = [ + #/\u{006f}\u{031b}\u{0323}/#, // o + horn + dot_below + #/\u{006f}\u{0323}\u{031b}/#, // o + dot_below + horn + #/\u{01a1}\u{0323}/#, // o-horn + dot_below + #/\u{1ecd}\u{031b}/#, // o-dot_below + horn + #/\u{1ee3}/#, // o-horn-dot_below + ] + + // Default: Grapheme cluster semantics + for (regexNum, regex) in regexes.enumerated() { + for (equivNum, equiv) in equivalents.enumerated() { + XCTAssertTrue( + equiv.contains(regex), + "Grapheme cluster semantics: Regex \(regexNum) didn't match with string \(equivNum)") + } + } + + // Unicode scalar semantics + for (regexNum, regex) in regexes.enumerated() { + for (equivNum, equiv) in equivalents.enumerated() { + let regex = regex.matchingSemantics(.unicodeScalar) + if regexNum == equivNum { + XCTAssertTrue( + equiv.contains(regex), + "Unicode scalar semantics: Regex \(regexNum) didn't match with string \(equivNum)") + } else { + XCTAssertFalse( + equiv.contains(regex), + "Unicode scalar semantics: Regex \(regexNum) incorrectly matched with string \(equivNum)") + } + } + } } // RL2.2 Extended Grapheme Clusters and Character Classes with Strings @@ -333,7 +373,7 @@ extension UTS18Tests { // named characters. 
func testNameProperty_XFail() { XCTExpectFailure("Need \\p{name=...} support") { - XCTFail("\(#/\p{name=BOM}/#)") + XCTFail(#"\(#/\p{name=BOM}/#)"#) // Name property // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=ZERO WIDTH NO-BREAK SPACE}/#)) // Name property and Matching Rules @@ -440,7 +480,7 @@ extension UTS18Tests { // IDS_Trinary_Operator // Equivalent_Unified_Ideograph XCTExpectFailure() - XCTFail("Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)") + XCTFail(#"Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)"#) // XCTAssertTrue("⼚⺁厂".contains(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)) // MARK: Case From ebd1297cff9e0e9aa3e7eb6d593f53ac5203bb66 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 19 Apr 2022 12:51:24 -0500 Subject: [PATCH 04/13] Fix canonical equivalence at different levels This re-interprets runs of characters and scalars as a quoted literal, and implements the correct semantic level matching for scalars, characters, and quoted literals. --- Sources/_StringProcessing/ByteCodeGen.swift | 56 ++++++++++++++----- .../_StringProcessing/ConsumerInterface.swift | 7 +++ .../Regex/ASTConversion.swift | 12 ++-- Tests/RegexTests/MatchTests.swift | 22 ++++---- 4 files changed, 68 insertions(+), 29 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 86309bb8a..b5350d72a 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -168,7 +168,15 @@ extension Compiler.ByteCodeGen { } mutating func emitCharacter(_ c: Character) throws { - // FIXME: Does semantic level matter? 
+ // Unicode scalar matches the specific scalars that comprise a character + if options.semanticLevel == .unicodeScalar { + print("emitting '\(c)' as a sequence of \(c.unicodeScalars.count) scalars") + for scalar in c.unicodeScalars { + try emitScalar(scalar) + } + return + } + if options.isCaseInsensitive && c.isCased { // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true) builder.buildConsume { input, bounds in @@ -627,22 +635,44 @@ extension Compiler.ByteCodeGen { try emitAtom(a) case let .quotedLiteral(s): - // TODO: Should this incorporate options? - if options.isCaseInsensitive { - // TODO: buildCaseInsensitiveMatchSequence(c) or alternative - builder.buildConsume { input, bounds in - var iterator = s.makeIterator() + if options.semanticLevel == .graphemeCluster { + if options.isCaseInsensitive { + // TODO: buildCaseInsensitiveMatchSequence(c) or alternative + builder.buildConsume { input, bounds in + var iterator = s.makeIterator() + var currentIndex = bounds.lowerBound + while let ch = iterator.next() { + guard currentIndex < bounds.upperBound, + ch.lowercased() == input[currentIndex].lowercased() + else { return nil } + input.formIndex(after: &currentIndex) + } + return currentIndex + } + } else { + builder.buildMatchSequence(s) + } + } else { + builder.buildConsume { + [caseInsensitive = options.isCaseInsensitive] input, bounds in + // TODO: Case folding + var iterator = s.unicodeScalars.makeIterator() var currentIndex = bounds.lowerBound - while let ch = iterator.next() { - guard currentIndex < bounds.upperBound, - ch.lowercased() == input[currentIndex].lowercased() - else { return nil } - input.formIndex(after: &currentIndex) + while let scalar = iterator.next() { + guard currentIndex < bounds.upperBound else { return nil } + if caseInsensitive { + if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping { + return nil + } + } else { + if scalar != input.unicodeScalars[currentIndex] { + return 
nil + } + } + input.unicodeScalars.formIndex(after: &currentIndex) + } return currentIndex } - } else { - builder.buildMatchSequence(s) } case let .regexLiteral(l): diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index f77cd322f..8b54f0527 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -131,6 +131,13 @@ extension AST.Atom { } } + var singleScalar: UnicodeScalar? { + switch kind { + case .scalar(let s): return s + default: return nil + } + } + func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 8acbd3b1b..b423d62ae 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -65,13 +65,17 @@ extension AST.Node { // TODO: For printing, nice to coalesce // scalars literals too. We likely need a different // approach even before we have a better IR. - guard let char = atom?.singleCharacter else { + if let char = atom?.singleCharacter { + result.append(char) + } else if let scalar = atom?.singleScalar { + result.append(Character(scalar)) + } else { break } - result.append(char) + astChildren.formIndex(after: &idx) } - return result.count <= 1 ? nil : (idx, result) + return result.isEmpty ? 
nil : (idx, result) } // No need to nest single children concatenations @@ -207,7 +211,7 @@ extension AST.Atom { switch self.kind { case let .char(c): return .char(c) - case let .scalar(s): return .scalar(s) + case let .scalar(s): return .char(Character(s)) case .any: return .any case let .backreference(r): return .backreference(r) case let .changeMatchingOptions(seq): return .changeMatchingOptions(seq) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 4d9ed4d01..c6fb18835 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -938,15 +938,15 @@ extension RegexTests { // TODO: Oniguruma \y and \Y firstMatchTests( - #"\u{65}"#, // Scalar 'e' is present in both: - ("Cafe\u{301}", "e"), // composed and - ("Sol Cafe", "e")) // standalone + #"\u{65}"#, // Scalar 'e' is present in both + ("Cafe\u{301}", nil), // but scalar mode requires boundary at end of match + ("Sol Cafe", "e")) // standalone is okay firstMatchTests( #"\u{65}\y"#, // Grapheme boundary assertion ("Cafe\u{301}", nil), ("Sol Cafe", "e")) firstMatchTests( - #"\u{65}\Y"#, // Grapheme non-boundary assertion + #"(?u)\u{65}\Y"#, // Grapheme non-boundary assertion ("Cafe\u{301}", "e"), ("Sol Cafe", nil)) } @@ -1353,11 +1353,10 @@ extension RegexTests { // as a character. 
firstMatchTest(#"\u{65}\u{301}$"#, input: eDecomposed, match: eDecomposed) - // FIXME: Decomposed character in regex literal doesn't match an equivalent character - firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed, - xfail: true) + firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed) - firstMatchTest(#"\u{65}"#, input: eDecomposed, match: "e") + firstMatchTest(#"\u{65}"#, input: eDecomposed, match: "e", + xfail: true) firstMatchTest(#"\u{65}$"#, input: eDecomposed, match: nil) // FIXME: \y is unsupported firstMatchTest(#"\u{65}\y"#, input: eDecomposed, match: nil, @@ -1381,12 +1380,10 @@ extension RegexTests { (eComposed, true), (eDecomposed, true)) - // FIXME: Decomposed character in regex literal doesn't match an equivalent character matchTest( #"e\u{301}$"#, (eComposed, true), - (eDecomposed, true), - xfail: true) + (eDecomposed, true)) matchTest( #"e$"#, @@ -1472,7 +1469,8 @@ extension RegexTests { firstMatchTest(#"\u{1F1F0}\u{1F1F7}"#, input: flag, match: flag) // First Unicode scalar followed by CCC of regional indicators - firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag) + firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag, + xfail: true) // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character // A CCC of regional indicators x 2 From 439d20341a32a6b762a373298527dd392954b5da Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 20 Apr 2022 12:24:56 -0500 Subject: [PATCH 05/13] .... 
--- Sources/_StringProcessing/ByteCodeGen.swift | 9 ++++++ .../_CharacterClassModel.swift | 10 ++++-- Tests/RegexTests/MatchTests.swift | 32 ++++++++++++++----- Tests/RegexTests/UTS18Tests.swift | 9 ++++-- 4 files changed, 47 insertions(+), 13 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index b5350d72a..fbabcb5b9 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -187,7 +187,16 @@ extension Compiler.ByteCodeGen { : nil } } else { + let done = builder.makeAddress() + let next = builder.makeAddress() + builder.buildSave(next) + for scalar in c.unicodeScalars { + try emitScalar(scalar) + } + builder.buildBranch(to: done) + builder.label(next) builder.buildMatch(c) + builder.label(done) } } diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index c9762f00e..bd36a6cde 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -177,10 +177,14 @@ public struct _CharacterClassModel: Hashable { return matched ? str.index(after: i) : nil case .unicodeScalar: let c = str.unicodeScalars[i] + var nextIndex = str.unicodeScalars.index(after: i) var matched: Bool switch cc { - case .any: matched = true - case .anyGrapheme: fatalError("Not matched in this mode") + case .any: + matched = true + case .anyGrapheme: + matched = true + nextIndex = str.index(after: i) case .digit: matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits) case .hexDigit: @@ -197,7 +201,7 @@ public struct _CharacterClassModel: Hashable { if isInverted { matched.toggle() } - return matched ? str.unicodeScalars.index(after: i) : nil + return matched ? 
nextIndex : nil } } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index c6fb18835..d07646077 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -400,7 +400,8 @@ extension RegexTests { "a++a", ("babc", nil), ("baaabc", nil), - ("bb", nil)) + ("bb", nil), + xfail: true) firstMatchTests( "a+?a", ("babc", nil), @@ -462,15 +463,11 @@ extension RegexTests { "a{2,4}+a", ("babc", nil), ("baabc", nil), - ("baaabc", nil), ("baaaaabc", "aaaaa"), ("baaaaaaaabc", "aaaaa"), ("bb", nil)) firstMatchTests( "a{,4}+a", - ("babc", nil), - ("baabc", nil), - ("baaabc", nil), ("baaaaabc", "aaaaa"), ("baaaaaaaabc", "aaaaa"), ("bb", nil)) @@ -478,11 +475,25 @@ extension RegexTests { "a{2,}+a", ("babc", nil), ("baabc", nil), + ("bb", nil)) + + // XFAIL'd versions of the above + firstMatchTests( + "a{2,4}+a", + ("baaabc", nil), + xfail: true) + firstMatchTests( + "a{,4}+a", + ("babc", nil), + ("baabc", nil), + ("baaabc", nil), + xfail: true) + firstMatchTests( + "a{2,}+a", ("baaabc", nil), ("baaaaabc", nil), ("baaaaaaaabc", nil), - ("bb", nil)) - + xfail: true) firstMatchTests( "(?:a{2,4}?b)+", @@ -940,7 +951,11 @@ extension RegexTests { firstMatchTests( #"\u{65}"#, // Scalar 'e' is present in both ("Cafe\u{301}", nil), // but scalar mode requires boundary at end of match + xfail: true) + firstMatchTests( + #"\u{65}"#, // Scalar 'e' is present in both ("Sol Cafe", "e")) // standalone is okay + firstMatchTests( #"\u{65}\y"#, // Grapheme boundary assertion ("Cafe\u{301}", nil), @@ -1355,7 +1370,8 @@ extension RegexTests { firstMatchTest(#"\u{65}\u{301}$"#, input: eDecomposed, match: eDecomposed) firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed) - firstMatchTest(#"\u{65}"#, input: eDecomposed, match: "e", + // FIXME: Implicit \y at end of match + firstMatchTest(#"\u{65}"#, input: eDecomposed, match: nil, xfail: true) firstMatchTest(#"\u{65}$"#, input: eDecomposed, match: nil) // FIXME: \y is 
unsupported diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index de13579b2..733126bcf 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -343,11 +343,16 @@ extension UTS18Tests { // matching against an arbitrary extended grapheme cluster, Character Classes // with Strings, and extended grapheme cluster boundaries. func testExtendedGraphemeClusters() { - XCTExpectFailure { XCTFail("Implement tests") } + XCTAssertTrue("abcdef🇬🇭".contains(#/abcdef.$/#)) + XCTAssertTrue("abcdef🇬🇭".contains(#/abcdef\X$/#)) + XCTAssertTrue("abcdef🇬🇭".contains(#/abcdef\X$/#.matchingSemantics(.unicodeScalar))) + XCTAssertTrue("abcdef🇬🇭".contains(#/abcdef.+\y/#.matchingSemantics(.unicodeScalar))) } func testCharacterClassesWithStrings() { - XCTExpectFailure { XCTFail("Implement tests") } + let regex = #/[a-z🧐🇧🇪🇧🇫🇧🇬]/# + XCTAssertTrue("🧐".contains(regex)) + XCTAssertTrue("🇧🇫".contains(regex)) } // RL2.3 Default Word Boundaries From dfd917b20d7fcb6dd32d44b8ecaf537c5a7aaeae Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 20 Apr 2022 13:20:44 -0500 Subject: [PATCH 06/13] Document possessive quantification issues --- Tests/RegexTests/MatchTests.swift | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index c6fb18835..e50334c04 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -169,6 +169,8 @@ func firstMatchTest( XCTAssertEqual(found, match, file: file, line: line) } } catch { + // FIXME: This allows non-matches to succeed even when xfail'd + // When xfail == true, this should report failure for match == nil if !xfail && match != nil { XCTFail("\(error)", file: file, line: line) } @@ -182,7 +184,9 @@ func firstMatchTests( syntax: SyntaxOptions = .traditional, enableTracing: Bool = false, dumpAST: Bool = false, - xfail: Bool = false + xfail: Bool = false, + file: 
StaticString = #filePath, + line: UInt = #line ) { for (input, match) in tests { firstMatchTest( @@ -192,7 +196,9 @@ func firstMatchTests( syntax: syntax, enableTracing: enableTracing, dumpAST: dumpAST, - xfail: xfail) + xfail: xfail, + file: file, + line: line) } } @@ -483,6 +489,24 @@ extension RegexTests { ("baaaaaaaabc", nil), ("bb", nil)) + // XFAIL'd possessive tests + firstMatchTests( + "a?+a", + ("a", nil), + xfail: true) + firstMatchTests( + "(a|a)?+a", + ("a", nil), + xfail: true) + firstMatchTests( + "(a|a){2,4}+a", + ("a", nil), + ("aa", nil)) + firstMatchTests( + "(a|a){2,4}+a", + ("aaa", nil), + ("aaaa", nil), + xfail: true) firstMatchTests( "(?:a{2,4}?b)+", From 5adaf13dac6896fd1d525fcceebdf76a5fcaf274 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 21 Apr 2022 10:07:30 -0500 Subject: [PATCH 07/13] Test named chars x semantic level --- Tests/RegexTests/UTS18Tests.swift | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index de13579b2..485eedb72 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -403,6 +403,10 @@ extension UTS18Tests { XCTAssertTrue("\u{1F514}".contains(#/\N{BELL}/#)) XCTAssertTrue("🐯".contains(#/\N{TIGER FACE}/#)) XCTAssertFalse("🐯".contains(#/\N{TIEGR FACE}/#)) + + // Matching semantic level + XCTAssertFalse("👩‍👩‍👧‍👦".contains(#/.\N{ZERO WIDTH JOINER}/#)) + XCTAssertTrue("👩‍👩‍👧‍👦".contains(#/(?u).\N{ZERO WIDTH JOINER}/#)) } func testIndividuallyNamedCharacters_XFail() { From f83d422d0b6ee243fefbfe9ca932641f6223b51c Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 5 May 2022 11:15:46 -0500 Subject: [PATCH 08/13] Enable loose matching on \N{...} scalar names --- .../_StringProcessing/ConsumerInterface.swift | 42 +++++++++++++++++-- Tests/RegexTests/UTS18Tests.swift | 10 +++-- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift 
b/Sources/_StringProcessing/ConsumerInterface.swift index 72e914b69..470050502 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -111,6 +111,38 @@ extension DSLTree.Atom { } } +extension String { + /// Compares this string to `other` using the loose matching rule UAX44-LM2, + /// which ignores case, whitespace, underscores, and nearly all medial + /// hyphens. + /// + /// FIXME: Only ignore medial hyphens + /// FIXME: Special case for U+1180 HANGUL JUNGSEONG O-E + /// See https://www.unicode.org/reports/tr44/#Matching_Rules + fileprivate func isEqualByUAX44LM2(to other: String) -> Bool { + var i = startIndex + var j = other.startIndex + + while i < endIndex { + if self[i].isWhitespace || self[i] == "-" || self[i] == "_" { + formIndex(after: &i) + continue + } + if other[j].isWhitespace || other[j] == "-" || other[j] == "_" { + other.formIndex(after: &j) + continue + } + + if self[i] != other[j] && self[i].lowercased() != other[j].lowercased() { + return false + } + + formIndex(after: &i) + other.formIndex(after: &j) + } + return i == endIndex && j == other.endIndex + } +} // TODO: This is basically an AST interpreter, which would // be good or interesting to build regardless, and serves @@ -174,10 +206,12 @@ extension AST.Atom { return try p.generateConsumer(opts) case let .namedCharacter(name): - return consumeScalarProp { - // TODO: alias? casing? - $0.name == name || $0.nameAlias == name - } + return consumeScalar(propertyScalarPredicate { + // FIXME: name aliases not covered by $0.nameAlias are missed + // e.g. 
U+FEFF is also 'FORM FEED', 'BYTE ORDER MARK', and 'BOM' + $0.name?.isEqualByUAX44LM2(to: name) == true + || $0.nameAlias?.isEqualByUAX44LM2(to: name) == true + }) case .any: assertionFailure( diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 9e96138b1..d76329670 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -408,7 +408,13 @@ extension UTS18Tests { XCTAssertTrue("\u{1F514}".contains(#/\N{BELL}/#)) XCTAssertTrue("🐯".contains(#/\N{TIGER FACE}/#)) XCTAssertFalse("🐯".contains(#/\N{TIEGR FACE}/#)) - + + // Loose matching + XCTAssertTrue("\u{263A}".contains(#/\N{whitesmilingface}/#)) + XCTAssertTrue("\u{263A}".contains(#/\N{wHiTe_sMiLiNg_fAcE}/#)) + XCTAssertTrue("\u{263A}".contains(#/\N{White Smiling-Face}/#)) + XCTAssertTrue("\u{FEFF}".contains(#/\N{zerowidthno breakspace}/#)) + // Matching semantic level XCTAssertFalse("👩‍👩‍👧‍👦".contains(#/.\N{ZERO WIDTH JOINER}/#)) XCTAssertTrue("👩‍👩‍👧‍👦".contains(#/(?u).\N{ZERO WIDTH JOINER}/#)) @@ -421,9 +427,7 @@ extension UTS18Tests { } XCTExpectFailure("Other named char failures -- investigate") { - XCTAssertTrue("\u{263A}".contains(#/\N{whitesmilingface}/#)) XCTAssertTrue("\u{C}".contains(#/\N{FORM FEED}/#)) - XCTAssertTrue("\u{FEFF}".contains(#/\N{zerowidthno breakspace}/#)) XCTAssertTrue("\u{FEFF}".contains(#/\N{BYTE ORDER MARK}/#)) XCTAssertTrue("\u{FEFF}".contains(#/\N{BOM}/#)) XCTAssertTrue("\u{7}".contains(#/\N{BEL}/#)) From 1282c706bb29e8affc7e45c60be7b07c22c964fb Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 5 May 2022 11:31:38 -0500 Subject: [PATCH 09/13] Make Unicode property classes work with semantics --- .../_StringProcessing/ConsumerInterface.swift | 323 ++++++++++-------- .../Unicode/CharacterProps.swift | 6 + Tests/RegexTests/MatchTests.swift | 8 +- Tests/RegexTests/UTS18Tests.swift | 6 +- 4 files changed, 187 insertions(+), 156 deletions(-) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift 
b/Sources/_StringProcessing/ConsumerInterface.swift index 470050502..7331897f1 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -353,8 +353,9 @@ extension DSLTree.CustomCharacterClass { } } if isInverted { - // FIXME: semantic level - return input.index(after: bounds.lowerBound) + return opts.semanticLevel == .graphemeCluster + ? input.index(after: bounds.lowerBound) + : input.unicodeScalars.index(after: bounds.lowerBound) } return nil } @@ -362,38 +363,26 @@ extension DSLTree.CustomCharacterClass { } // NOTE: Conveniences, though not most performant -private func consumeScalarScript( - _ s: Unicode.Script -) -> MEProgram.ConsumeFunction { - consumeScalar { - Unicode.Script($0) == s - } +typealias ScalarPredicate = (UnicodeScalar) -> Bool + +private func scriptScalarPredicate(_ s: Unicode.Script) -> ScalarPredicate { + { Unicode.Script($0) == s } } -private func consumeScalarScriptExtension( - _ s: Unicode.Script -) -> MEProgram.ConsumeFunction { - consumeScalar { - let extensions = Unicode.Script.extensions(for: $0) - return extensions.contains(s) - } +private func scriptExtensionScalarPredicate(_ s: Unicode.Script) -> ScalarPredicate { + { Unicode.Script.extensions(for: $0).contains(s) } } -private func consumeScalarGC( - _ gc: Unicode.GeneralCategory -) -> MEProgram.ConsumeFunction { - consumeScalar { gc == $0.properties.generalCategory } +private func categoryScalarPredicate(_ gc: Unicode.GeneralCategory) -> ScalarPredicate { + { gc == $0.properties.generalCategory } } -private func consumeScalarGCs( - _ gcs: [Unicode.GeneralCategory] -) -> MEProgram.ConsumeFunction { - consumeScalar { gcs.contains($0.properties.generalCategory) } +private func categoriesScalarPredicate(_ gcs: [Unicode.GeneralCategory]) -> ScalarPredicate { + { gcs.contains($0.properties.generalCategory) } } -private func consumeScalarProp( - _ p: @escaping (Unicode.Scalar.Properties) -> Bool -) -> 
MEProgram.ConsumeFunction { - consumeScalar { p($0.properties) } +private func propertyScalarPredicate(_ p: @escaping (Unicode.Scalar.Properties) -> Bool) -> ScalarPredicate { + { p($0.properties) } } + func consumeScalar( - _ p: @escaping (Unicode.Scalar) -> Bool + _ p: @escaping ScalarPredicate ) -> MEProgram.ConsumeFunction { { input, bounds in // TODO: bounds check? @@ -405,6 +394,37 @@ func consumeScalar( return nil } } +func consumeCharacterWithLeadingScalar( + _ p: @escaping ScalarPredicate +) -> MEProgram.ConsumeFunction { + { input, bounds in + let curIdx = bounds.lowerBound + if p(input[curIdx].unicodeScalars.first!) { + return input.index(after: curIdx) + } + return nil + } +} +func consumeCharacterWithSingleScalar( + _ p: @escaping ScalarPredicate +) -> MEProgram.ConsumeFunction { + { input, bounds in + let curIdx = bounds.lowerBound + + if input[curIdx].hasExactlyOneScalar && p(input[curIdx].unicodeScalars.first!) { + return input.index(after: curIdx) + } + return nil + } +} + +func consumeFunction( + for opts: MatchingOptions +) -> (@escaping ScalarPredicate) -> MEProgram.ConsumeFunction { + opts.semanticLevel == .graphemeCluster + ? consumeCharacterWithLeadingScalar + : consumeScalar +} extension AST.Atom.CharacterProperty { func generateConsumer( @@ -416,16 +436,15 @@ extension AST.Atom.CharacterProperty { ) -> MEProgram.ConsumeFunction { return { input, bounds in if p(input, bounds) != nil { return nil } - // TODO: semantic level + // TODO: bounds check - return input.unicodeScalars.index( - after: bounds.lowerBound) + return opts.semanticLevel == .graphemeCluster + ? input.index(after: bounds.lowerBound) + : input.unicodeScalars.index(after: bounds.lowerBound) } } - // FIXME: Below is largely scalar based, for convenience, - // but we want a comprehensive treatment to semantic mode - // switching. 
+ let consume = consumeFunction(for: opts) let preInversion: MEProgram.ConsumeFunction = try { switch kind { @@ -436,11 +455,16 @@ extension AST.Atom.CharacterProperty { return input.index(after: bounds.lowerBound) } case .assigned: - return consumeScalar { + return consume { $0.properties.generalCategory != .unassigned } case .ascii: - return consumeScalar(\.isASCII) + // Note: ASCII must look at the whole character, not just the first + // scalar. That is, "e\u{301}" is not an ASCII character, even though + // the first scalar is. + return opts.semanticLevel == .graphemeCluster + ? consumeCharacterWithSingleScalar(\.isASCII) + : consumeScalar(\.isASCII) case .generalCategory(let p): return try p.generateConsumer(opts) @@ -451,10 +475,10 @@ extension AST.Atom.CharacterProperty { return value ? cons : invert(cons) case .script(let s): - return consumeScalarScript(s) + return consume(scriptScalarPredicate(s)) case .scriptExtension(let s): - return consumeScalarScriptExtension(s) + return consume(scriptExtensionScalarPredicate(s)) case .posix(let p): return p.generateConsumer(opts) @@ -477,49 +501,48 @@ extension Unicode.BinaryProperty { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction { + let consume = consumeFunction(for: opts) + switch self { - case .asciiHexDigit: - return consumeScalarProp { + return consume(propertyScalarPredicate { $0.isHexDigit && $0.isASCIIHexDigit - } + }) case .alphabetic: - return consumeScalarProp(\.isAlphabetic) + return consume(propertyScalarPredicate(\.isAlphabetic)) case .bidiControl: break - - - case .bidiMirrored: - return consumeScalarProp(\.isBidiMirrored) + case .bidiMirrored: + return consume(propertyScalarPredicate(\.isBidiMirrored)) case .cased: - return consumeScalarProp(\.isCased) + return consume(propertyScalarPredicate(\.isCased)) case .compositionExclusion: break case .caseIgnorable: - return consumeScalarProp(\.isCaseIgnorable) + return 
consume(propertyScalarPredicate(\.isCaseIgnorable)) case .changesWhenCasefolded: - return consumeScalarProp(\.changesWhenCaseFolded) + return consume(propertyScalarPredicate(\.changesWhenCaseFolded)) case .changesWhenCasemapped: - return consumeScalarProp(\.changesWhenCaseMapped) + return consume(propertyScalarPredicate(\.changesWhenCaseMapped)) case .changesWhenNFKCCasefolded: - return consumeScalarProp(\.changesWhenNFKCCaseFolded) + return consume(propertyScalarPredicate(\.changesWhenNFKCCaseFolded)) case .changesWhenLowercased: - return consumeScalarProp(\.changesWhenLowercased) + return consume(propertyScalarPredicate(\.changesWhenLowercased)) case .changesWhenTitlecased: - return consumeScalarProp(\.changesWhenTitlecased) + return consume(propertyScalarPredicate(\.changesWhenTitlecased)) case .changesWhenUppercased: - return consumeScalarProp(\.changesWhenUppercased) + return consume(propertyScalarPredicate(\.changesWhenUppercased)) case .dash: - return consumeScalarProp(\.isDash) + return consume(propertyScalarPredicate(\.isDash)) case .deprecated: - return consumeScalarProp(\.isDeprecated) + return consume(propertyScalarPredicate(\.isDeprecated)) case .defaultIgnorableCodePoint: - return consumeScalarProp(\.isDefaultIgnorableCodePoint) + return consume(propertyScalarPredicate(\.isDefaultIgnorableCodePoint)) case .diacratic: // spelling? 
- return consumeScalarProp(\.isDiacritic) + return consume(propertyScalarPredicate(\.isDiacritic)) case .emojiModifierBase: if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { - return consumeScalarProp(\.isEmojiModifierBase) + return consume(propertyScalarPredicate(\.isEmojiModifierBase)) } else { throw Unsupported( "isEmojiModifierBase on old OSes") @@ -528,59 +551,59 @@ extension Unicode.BinaryProperty { break case .emojiModifier: if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { - return consumeScalarProp(\.isEmojiModifier) + return consume(propertyScalarPredicate(\.isEmojiModifier)) } else { throw Unsupported("isEmojiModifier on old OSes") } case .emoji: if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { - return consumeScalarProp(\.isEmoji) + return consume(propertyScalarPredicate(\.isEmoji)) } else { throw Unsupported("isEmoji on old OSes") } case .emojiPresentation: if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { - return consumeScalarProp(\.isEmojiPresentation) + return consume(propertyScalarPredicate(\.isEmojiPresentation)) } else { throw Unsupported( "isEmojiPresentation on old OSes") } case .extender: - return consumeScalarProp(\.isExtender) + return consume(propertyScalarPredicate(\.isExtender)) case .extendedPictographic: break // NOTE: Stdlib has this data internally case .fullCompositionExclusion: - return consumeScalarProp(\.isFullCompositionExclusion) + return consume(propertyScalarPredicate(\.isFullCompositionExclusion)) case .graphemeBase: - return consumeScalarProp(\.isGraphemeBase) + return consume(propertyScalarPredicate(\.isGraphemeBase)) case .graphemeExtended: - return consumeScalarProp(\.isGraphemeExtend) + return consume(propertyScalarPredicate(\.isGraphemeExtend)) case .graphemeLink: break case .hexDigit: - return consumeScalarProp(\.isHexDigit) + return consume(propertyScalarPredicate(\.isHexDigit)) case .hyphen: break case .idContinue: - return 
consumeScalarProp(\.isIDContinue) + return consume(propertyScalarPredicate(\.isIDContinue)) case .ideographic: - return consumeScalarProp(\.isIdeographic) + return consume(propertyScalarPredicate(\.isIdeographic)) case .idStart: - return consumeScalarProp(\.isIDStart) + return consume(propertyScalarPredicate(\.isIDStart)) case .idsBinaryOperator: - return consumeScalarProp(\.isIDSBinaryOperator) + return consume(propertyScalarPredicate(\.isIDSBinaryOperator)) case .idsTrinaryOperator: - return consumeScalarProp(\.isIDSTrinaryOperator) + return consume(propertyScalarPredicate(\.isIDSTrinaryOperator)) case .joinControl: - return consumeScalarProp(\.isJoinControl) + return consume(propertyScalarPredicate(\.isJoinControl)) case .logicalOrderException: - return consumeScalarProp(\.isLogicalOrderException) + return consume(propertyScalarPredicate(\.isLogicalOrderException)) case .lowercase: - return consumeScalarProp(\.isLowercase) + return consume(propertyScalarPredicate(\.isLowercase)) case .math: - return consumeScalarProp(\.isMath) + return consume(propertyScalarPredicate(\.isMath)) case .noncharacterCodePoint: - return consumeScalarProp(\.isNoncharacterCodePoint) + return consume(propertyScalarPredicate(\.isNoncharacterCodePoint)) case .otherAlphabetic: break case .otherDefaultIgnorableCodePoint: @@ -598,37 +621,37 @@ extension Unicode.BinaryProperty { case .otherUppercase: break case .patternSyntax: - return consumeScalarProp(\.isPatternSyntax) + return consume(propertyScalarPredicate(\.isPatternSyntax)) case .patternWhitespace: - return consumeScalarProp(\.isPatternWhitespace) + return consume(propertyScalarPredicate(\.isPatternWhitespace)) case .prependedConcatenationMark: break case .quotationMark: - return consumeScalarProp(\.isQuotationMark) + return consume(propertyScalarPredicate(\.isQuotationMark)) case .radical: - return consumeScalarProp(\.isRadical) + return consume(propertyScalarPredicate(\.isRadical)) case .regionalIndicator: - return consumeScalar { s 
in + return consume { s in (0x1F1E6...0x1F1FF).contains(s.value) } case .softDotted: - return consumeScalarProp(\.isSoftDotted) + return consume(propertyScalarPredicate(\.isSoftDotted)) case .sentenceTerminal: - return consumeScalarProp(\.isSentenceTerminal) + return consume(propertyScalarPredicate(\.isSentenceTerminal)) case .terminalPunctuation: - return consumeScalarProp(\.isTerminalPunctuation) + return consume(propertyScalarPredicate(\.isTerminalPunctuation)) case .unifiedIdiograph: // spelling? - return consumeScalarProp(\.isUnifiedIdeograph) + return consume(propertyScalarPredicate(\.isUnifiedIdeograph)) case .uppercase: - return consumeScalarProp(\.isUppercase) + return consume(propertyScalarPredicate(\.isUppercase)) case .variationSelector: - return consumeScalarProp(\.isVariationSelector) + return consume(propertyScalarPredicate(\.isVariationSelector)) case .whitespace: - return consumeScalarProp(\.isWhitespace) + return consume(propertyScalarPredicate(\.isWhitespace)) case .xidContinue: - return consumeScalarProp(\.isXIDContinue) + return consume(propertyScalarPredicate(\.isXIDContinue)) case .xidStart: - return consumeScalarProp(\.isXIDStart) + return consume(propertyScalarPredicate(\.isXIDStart)) case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD, .expandsOnNFKC: throw Unsupported("Unicode-deprecated: \(self)") @@ -643,42 +666,44 @@ extension Unicode.POSIXProperty { func generateConsumer( _ opts: MatchingOptions ) -> MEProgram.ConsumeFunction { - // FIXME: semantic levels, modes, etc + let consume = consumeFunction(for: opts) + + // FIXME: modes, etc switch self { case .alnum: - return consumeScalarProp { + return consume(propertyScalarPredicate { $0.isAlphabetic || $0.numericType != nil - } + }) case .blank: - return consumeScalar { s in + return consume { s in s.properties.generalCategory == .spaceSeparator || s == "\t" } case .graph: - return consumeScalarProp { p in + return consume(propertyScalarPredicate { p in !( p.isWhitespace || 
p.generalCategory == .control || p.generalCategory == .surrogate || p.generalCategory == .unassigned ) - } + }) case .print: - return consumeScalarProp { p in + return consume(propertyScalarPredicate { p in // FIXME: better def p.generalCategory != .control - } + }) case .word: - return consumeScalarProp { p in + return consume(propertyScalarPredicate { p in // FIXME: better def p.isAlphabetic || p.numericType != nil || p.isJoinControl || p.isDash// marks and connectors... - } + }) case .xdigit: - return consumeScalarProp(\.isHexDigit) // or number + return consume(propertyScalarPredicate(\.isHexDigit)) // or number } } @@ -689,113 +714,115 @@ extension Unicode.ExtendedGeneralCategory { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction { + let consume = consumeFunction(for: opts) + switch self { case .letter: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .uppercaseLetter, .lowercaseLetter, .titlecaseLetter, .modifierLetter, .otherLetter - ]) + ])) case .mark: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .nonspacingMark, .spacingMark, .enclosingMark - ]) + ])) case .number: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .decimalNumber, .letterNumber, .otherNumber - ]) + ])) case .symbol: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .mathSymbol, .currencySymbol, .modifierSymbol, .otherSymbol - ]) + ])) case .punctuation: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .connectorPunctuation, .dashPunctuation, .openPunctuation, .closePunctuation, .initialPunctuation, .finalPunctuation, .otherPunctuation - ]) + ])) case .separator: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .spaceSeparator, .lineSeparator, .paragraphSeparator - ]) + ])) case .other: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .control, .format, .surrogate, .privateUse, 
.unassigned - ]) + ])) case .casedLetter: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .uppercaseLetter, .lowercaseLetter, .titlecaseLetter - ]) + ])) case .control: - return consumeScalarGC(.control) + return consume(categoryScalarPredicate(.control)) case .format: - return consumeScalarGC(.format) + return consume(categoryScalarPredicate(.format)) case .unassigned: - return consumeScalarGC(.unassigned) + return consume(categoryScalarPredicate(.unassigned)) case .privateUse: - return consumeScalarGC(.privateUse) + return consume(categoryScalarPredicate(.privateUse)) case .surrogate: - return consumeScalarGC(.surrogate) + return consume(categoryScalarPredicate(.surrogate)) case .lowercaseLetter: - return consumeScalarGC(.lowercaseLetter) + return consume(categoryScalarPredicate(.lowercaseLetter)) case .modifierLetter: - return consumeScalarGC(.modifierLetter) + return consume(categoryScalarPredicate(.modifierLetter)) case .otherLetter: - return consumeScalarGC(.otherLetter) + return consume(categoryScalarPredicate(.otherLetter)) case .titlecaseLetter: - return consumeScalarGC(.titlecaseLetter) + return consume(categoryScalarPredicate(.titlecaseLetter)) case .uppercaseLetter: - return consumeScalarGC(.uppercaseLetter) + return consume(categoryScalarPredicate(.uppercaseLetter)) case .spacingMark: - return consumeScalarGC(.spacingMark) + return consume(categoryScalarPredicate(.spacingMark)) case .enclosingMark: - return consumeScalarGC(.enclosingMark) + return consume(categoryScalarPredicate(.enclosingMark)) case .nonspacingMark: - return consumeScalarGC(.nonspacingMark) + return consume(categoryScalarPredicate(.nonspacingMark)) case .decimalNumber: - return consumeScalarGC(.decimalNumber) + return consume(categoryScalarPredicate(.decimalNumber)) case .letterNumber: - return consumeScalarGC(.letterNumber) + return consume(categoryScalarPredicate(.letterNumber)) case .otherNumber: - return consumeScalarGC(.otherNumber) + return 
consume(categoryScalarPredicate(.otherNumber)) case .connectorPunctuation: - return consumeScalarGC(.connectorPunctuation) + return consume(categoryScalarPredicate(.connectorPunctuation)) case .dashPunctuation: - return consumeScalarGC(.dashPunctuation) + return consume(categoryScalarPredicate(.dashPunctuation)) case .closePunctuation: - return consumeScalarGC(.closePunctuation) + return consume(categoryScalarPredicate(.closePunctuation)) case .finalPunctuation: - return consumeScalarGC(.finalPunctuation) + return consume(categoryScalarPredicate(.finalPunctuation)) case .initialPunctuation: - return consumeScalarGC(.initialPunctuation) + return consume(categoryScalarPredicate(.initialPunctuation)) case .otherPunctuation: - return consumeScalarGC(.otherPunctuation) + return consume(categoryScalarPredicate(.otherPunctuation)) case .openPunctuation: - return consumeScalarGC(.openPunctuation) + return consume(categoryScalarPredicate(.openPunctuation)) case .currencySymbol: - return consumeScalarGC(.currencySymbol) + return consume(categoryScalarPredicate(.currencySymbol)) case .modifierSymbol: - return consumeScalarGC(.modifierSymbol) + return consume(categoryScalarPredicate(.modifierSymbol)) case .mathSymbol: - return consumeScalarGC(.mathSymbol) + return consume(categoryScalarPredicate(.mathSymbol)) case .otherSymbol: - return consumeScalarGC(.otherSymbol) + return consume(categoryScalarPredicate(.otherSymbol)) case .lineSeparator: - return consumeScalarGC(.lineSeparator) + return consume(categoryScalarPredicate(.lineSeparator)) case .paragraphSeparator: - return consumeScalarGC(.paragraphSeparator) + return consume(categoryScalarPredicate(.paragraphSeparator)) case .spaceSeparator: - return consumeScalarGC(.spaceSeparator) + return consume(categoryScalarPredicate(.spaceSeparator)) } } } diff --git a/Sources/_StringProcessing/Unicode/CharacterProps.swift b/Sources/_StringProcessing/Unicode/CharacterProps.swift index cfa68c425..80f6819a6 100644 --- 
a/Sources/_StringProcessing/Unicode/CharacterProps.swift +++ b/Sources/_StringProcessing/Unicode/CharacterProps.swift @@ -12,3 +12,9 @@ // TODO +extension Character { + /// Whether this character is made up of exactly one Unicode scalar value. + var hasExactlyOneScalar: Bool { + unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex + } +} diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index f999f2fe5..4bf2da106 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -499,7 +499,7 @@ extension RegexTests { ("baaabc", nil), ("baaaaabc", nil), ("baaaaaaaabc", nil), - ("bb", nil)) + xfail: true) // XFAIL'd possessive tests firstMatchTests( @@ -1454,8 +1454,7 @@ extension RegexTests { // \p{Letter} firstMatchTest(#"\p{Letter}$"#, input: eComposed, match: eComposed) // FIXME: \p{Letter} doesn't match a decomposed character - firstMatchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed, - xfail: true) + firstMatchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed) // \d firstMatchTest(#"\d"#, input: "5", match: "5") @@ -1560,8 +1559,7 @@ extension RegexTests { // FIXME: \O is unsupported firstMatchTest(#"(?u)\O\u{301}"#, input: eDecomposed, match: eDecomposed) - firstMatchTest(#"(?u)e\O"#, input: eDecomposed, match: eDecomposed, - xfail: true) + firstMatchTest(#"(?u)e\O"#, input: eDecomposed, match: eDecomposed) firstMatchTest(#"\O"#, input: eComposed, match: eComposed) firstMatchTest(#"\O"#, input: eDecomposed, match: nil, xfail: true) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index d76329670..95f820bc1 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -15,9 +15,9 @@ import _StringProcessing class UTS18Tests: XCTestCase { var input: String { - "ABCdefghîøü\u{FFF0} -–—[]123" - // 012345678901 234567890 - // 0 10 20 + "ABCdefghîøu\u{308}\u{FFF0} -–—[]123" + // 01234567890 1 
234567890 + // 0 10 20 } } From 2dec7fd06ef00251159e91b94979fa0d1b9e84e3 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 5 May 2022 11:54:39 -0500 Subject: [PATCH 10/13] Fix up an expected failure block --- Tests/RegexTests/UTS18Tests.swift | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 95f820bc1..0b20648ad 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -492,9 +492,10 @@ extension UTS18Tests { // IDS_Binary_Operator // IDS_Trinary_Operator // Equivalent_Unified_Ideograph - XCTExpectFailure() - XCTFail(#"Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)"#) - // XCTAssertTrue("⼚⺁厂".contains(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)) + XCTExpectFailure { + XCTFail(#"Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)"#) + // XCTAssertTrue("⼚⺁厂".contains(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)) + } // MARK: Case // Uppercase From fb1324bbc7a3b167e0b097701e514271400d23d7 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 5 May 2022 14:19:01 -0500 Subject: [PATCH 11/13] Remove regex literals from UTS-18 tests --- Package.swift | 1 - Tests/RegexTests/UTS18Tests.swift | 221 ++++++++++++++++-------------- 2 files changed, 117 insertions(+), 105 deletions(-) diff --git a/Package.swift b/Package.swift index 5e90cba7f..8303fc5cb 100644 --- a/Package.swift +++ b/Package.swift @@ -67,7 +67,6 @@ let package = Package( name: "RegexTests", dependencies: ["_StringProcessing"], swiftSettings: [ - .unsafeFlags(["-Xfrontend", "-enable-experimental-string-processing"]), .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]), ]), .testTarget( diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 0b20648ad..71f459a1b 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -9,6 +9,15 @@ // 
//===----------------------------------------------------------------------===// +// This test suite includes tests that verify the behavior of `Regex` as it +// relates to Unicode Technical Standard #18: Unicode Regular Expressions. +// +// Please note: Quotations of UTS18 in this file mostly use 'Character' to mean +// Unicode code point, and 'String' to mean 'sequence of code points' — they +// are not the Swift meanings of those terms. +// +// See https://unicode.org/reports/tr18/ for more. + import XCTest @testable // for internal `matches(of:)` import _StringProcessing @@ -21,6 +30,10 @@ class UTS18Tests: XCTestCase { } } +fileprivate func regex(_ pattern: String) -> Regex { + try! Regex(pattern, as: Substring.self) +} + fileprivate extension String { subscript(pos bounds: R) -> Substring where R.Bound == Int @@ -54,9 +67,9 @@ extension UTS18Tests { // To meet this requirement, an implementation shall supply a mechanism for // specifying any Unicode code point (from U+0000 to U+10FFFF), using the // hexadecimal code point representation. - func testHexNotation() throws { - expectFirstMatch("ab", #/\u{61}\u{62}/#, "ab") - expectFirstMatch("𝄞", #/\u{1D11E}/#, "𝄞") + func testHexNotation() { + expectFirstMatch("ab", regex(#"\u{61}\u{62}"#), "ab") + expectFirstMatch("𝄞", regex(#"\u{1D11E}"#), "𝄞") } // 1.1.1 Hex Notation and Normalization @@ -80,50 +93,50 @@ extension UTS18Tests { // Binary, Enumerated, Catalog, and Name values must follow the Matching // Rules from [UAX44] with one exception: implementations are not required // to ignore an initial prefix string of "is" in property values. 
- func testProperties() throws { + func testProperties() { // General_Category - expectFirstMatch(input, #/\p{Lu}+/#, input[pos: ..<3]) - expectFirstMatch(input, #/\p{lu}+/#, input[pos: ..<3]) - expectFirstMatch(input, #/\p{uppercase letter}+/#, input[pos: ..<3]) - expectFirstMatch(input, #/\p{Uppercase Letter}+/#, input[pos: ..<3]) - expectFirstMatch(input, #/\p{Uppercase_Letter}+/#, input[pos: ..<3]) - expectFirstMatch(input, #/\p{uppercaseletter}+/#, input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{Lu}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{lu}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{uppercase letter}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{Uppercase Letter}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{Uppercase_Letter}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{uppercaseletter}+"#), input[pos: ..<3]) - expectFirstMatch(input, #/\p{P}+/#, "-–—[]") - expectFirstMatch(input, #/\p{Pd}+/#, "-–—") + expectFirstMatch(input, regex(#"\p{P}+"#), "-–—[]") + expectFirstMatch(input, regex(#"\p{Pd}+"#), "-–—") - expectFirstMatch(input, #/\p{Any}+/#, input[...]) - expectFirstMatch(input, #/\p{Assigned}+/#, input[pos: ..<11]) - expectFirstMatch(input, #/\p{ASCII}+/#, input[pos: ..<8]) + expectFirstMatch(input, regex(#"\p{Any}+"#), input[...]) + expectFirstMatch(input, regex(#"\p{Assigned}+"#), input[pos: ..<11]) + expectFirstMatch(input, regex(#"\p{ASCII}+"#), input[pos: ..<8]) // Script and Script_Extensions // U+3042 あ HIRAGANA LETTER A Hira {Hira} - XCTAssertTrue("\u{3042}".contains(#/\p{Hira}/#)) - XCTAssertTrue("\u{3042}".contains(#/\p{sc=Hira}/#)) - XCTAssertTrue("\u{3042}".contains(#/\p{scx=Hira}/#)) + XCTAssertTrue("\u{3042}".contains(regex(#"\p{Hira}"#))) + XCTAssertTrue("\u{3042}".contains(regex(#"\p{sc=Hira}"#))) + XCTAssertTrue("\u{3042}".contains(regex(#"\p{scx=Hira}"#))) // U+30FC ー KATAKANA-HIRAGANA PROLONGED SOUND MARK Zyyy = Common {Hira, Kana} - 
XCTAssertTrue("\u{30FC}".contains(#/\p{Hira}/#)) // Implicit = Script_Extensions - XCTAssertTrue("\u{30FC}".contains(#/\p{Kana}/#)) - XCTAssertTrue("\u{30FC}".contains(#/\p{sc=Zyyy}/#)) // Explicit = Script - XCTAssertTrue("\u{30FC}".contains(#/\p{scx=Hira}/#)) - XCTAssertTrue("\u{30FC}".contains(#/\p{scx=Kana}/#)) - XCTAssertFalse("\u{30FC}".contains(#/\p{sc=Hira}/#)) - XCTAssertFalse("\u{30FC}".contains(#/\p{sc=Kana}/#)) + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{Hira}"#))) // Implicit = Script_Extensions + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{Kana}"#))) + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{sc=Zyyy}"#))) // Explicit = Script + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{scx=Hira}"#))) + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{scx=Kana}"#))) + XCTAssertFalse("\u{30FC}".contains(regex(#"\p{sc=Hira}"#))) + XCTAssertFalse("\u{30FC}".contains(regex(#"\p{sc=Kana}"#))) // Uppercase, etc - expectFirstMatch(input, #/\p{Uppercase}+/#, input[pos: ..<3]) - expectFirstMatch(input, #/\p{isUppercase}+/#, input[pos: ..<3]) - expectFirstMatch(input, #/\p{Uppercase=true}+/#, input[pos: ..<3]) - expectFirstMatch(input, #/\p{is Uppercase}+/#, input[pos: ..<3]) - expectFirstMatch(input, #/\p{is uppercase = true}+/#, input[pos: ..<3]) - expectFirstMatch(input, #/\p{lowercase}+/#, input[pos: 3..<11]) - expectFirstMatch(input, #/\p{whitespace}+/#, input[pos: 12..<13]) + expectFirstMatch(input, regex(#"\p{Uppercase}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{isUppercase}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{Uppercase=true}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{is Uppercase}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{is uppercase = true}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{lowercase}+"#), input[pos: 3..<11]) + expectFirstMatch(input, regex(#"\p{whitespace}+"#), input[pos: 12..<13]) // Block vs Writing System let greekScalar = "Θ" // U+0398 let 
greekExtendedScalar = "ἀ" // U+1F00 - XCTAssertTrue(greekScalar.contains(#/\p{Greek}/#)) - XCTAssertTrue(greekExtendedScalar.contains(#/\p{Greek}/#)) + XCTAssertTrue(greekScalar.contains(regex(#"\p{Greek}"#))) + XCTAssertTrue(greekExtendedScalar.contains(regex(#"\p{Greek}"#))) } func testProperties_XFail() { @@ -142,19 +155,19 @@ extension UTS18Tests { // the Standard Recommendation or POSIX-compatible properties. func testCompatibilityProperties() throws { // FIXME: These tests seem insufficient - expectFirstMatch(input, #/[[:alpha:]]+/#, input[pos: ..<11]) - expectFirstMatch(input, #/[[:upper:]]+/#, input[pos: ..<3]) - expectFirstMatch(input, #/[[:lower:]]+/#, input[pos: 3..<11]) - expectFirstMatch(input, #/[[:punct:]]+/#, input[pos: 13..<18]) - expectFirstMatch(input, #/[[:digit:]]+/#, input[pos: 18..<21]) - expectFirstMatch(input, #/[[:xdigit:]]+/#, input[pos: ..<6]) - expectFirstMatch(input, #/[[:alnum:]]+/#, input[pos: ..<11]) - expectFirstMatch(input, #/[[:space:]]+/#, input[pos: 12..<13]) + expectFirstMatch(input, regex(#"[[:alpha:]]+"#), input[pos: ..<11]) + expectFirstMatch(input, regex(#"[[:upper:]]+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"[[:lower:]]+"#), input[pos: 3..<11]) + expectFirstMatch(input, regex(#"[[:punct:]]+"#), input[pos: 13..<18]) + expectFirstMatch(input, regex(#"[[:digit:]]+"#), input[pos: 18..<21]) + expectFirstMatch(input, regex(#"[[:xdigit:]]+"#), input[pos: ..<6]) + expectFirstMatch(input, regex(#"[[:alnum:]]+"#), input[pos: ..<11]) + expectFirstMatch(input, regex(#"[[:space:]]+"#), input[pos: 12..<13]) // TODO: blank // TODO: cntrl - expectFirstMatch(input, #/[[:graph:]]+/#, input[pos: ..<11]) - expectFirstMatch(input, #/[[:print:]]+/#, input[...]) - expectFirstMatch(input, #/[[:word:]]+/#, input[pos: ..<11]) + expectFirstMatch(input, regex(#"[[:graph:]]+"#), input[pos: ..<11]) + expectFirstMatch(input, regex(#"[[:print:]]+"#), input[...]) + expectFirstMatch(input, regex(#"[[:word:]]+"#), input[pos: ..<11]) } 
//RL1.3 Subtraction and Intersection @@ -162,27 +175,27 @@ extension UTS18Tests { // To meet this requirement, an implementation shall supply mechanisms for // union, intersection and set-difference of sets of characters within // regular expression character class expressions. - func testSubtractionAndIntersection() { + func testSubtractionAndIntersection() throws { // Non-ASCII letters - expectFirstMatch(input, #/[\p{Letter}--\p{ASCII}]+/#, input[pos: 8..<11]) + expectFirstMatch(input, regex(#"[\p{Letter}--\p{ASCII}]+"#), input[pos: 8..<11]) // Digits that aren't 1 or 2 - expectFirstMatch(input, #/[\p{digit}--[12]]+/#, input[pos: 20..<21]) + expectFirstMatch(input, regex(#"[\p{digit}--[12]]+"#), input[pos: 20..<21]) // ASCII-only letters - expectFirstMatch(input, #/[\p{Letter}&&\p{ASCII}]+/#, input[pos: ..<8]) + expectFirstMatch(input, regex(#"[\p{Letter}&&\p{ASCII}]+"#), input[pos: ..<8]) // Digits that are 2 or 3 - expectFirstMatch(input, #/[\p{digit}&&[23]]+/#, input[pos: 19..<21]) + expectFirstMatch(input, regex(#"[\p{digit}&&[23]]+"#), input[pos: 19..<21]) // Non-ASCII lowercase + non-lowercase ASCII - expectFirstMatch(input, #/[\p{lowercase}~~\p{ascii}]+/#, input[pos: ..<3]) - XCTAssertTrue("123%&^ABC".contains(#/^[\p{lowercase}~~\p{ascii}]+$/#)) + expectFirstMatch(input, regex(#"[\p{lowercase}~~\p{ascii}]+"#), input[pos: ..<3]) + XCTAssertTrue("123%&^ABC".contains(regex(#"^[\p{lowercase}~~\p{ascii}]+$"#))) } func testSubtractionAndIntersectionPrecedence() { - expectFirstMatch("ABC123-", #/[[:alnum:]]*-/#, "ABC123-") - expectFirstMatch("ABC123-", #/[[:alnum:]--\p{Uppercase}]*-/#, "123-") + expectFirstMatch("ABC123-", regex(#"[[:alnum:]]*-"#), "ABC123-") + expectFirstMatch("ABC123-", regex(#"[[:alnum:]--\p{Uppercase}]*-"#), "123-") // Union binds more closely than difference - expectFirstMatch("ABC123-", #/[[:alnum:]--\p{Uppercase}[:digit:]]*-/#, "-") + expectFirstMatch("ABC123-", regex(#"[[:alnum:]--\p{Uppercase}[:digit:]]*-"#), "-") // TODO: Test for 
intersection precedence } @@ -197,7 +210,7 @@ extension UTS18Tests { // - Nonspacing marks are never divided from their base characters, and // otherwise ignored in locating boundaries. func testSimpleWordBoundaries() { - let simpleWordRegex = #/.+?\b/#.wordBoundaryKind(.unicodeLevel1) + let simpleWordRegex = regex(#".+?\b"#).wordBoundaryKind(.unicodeLevel1) expectFirstMatch(input, simpleWordRegex, input[pos: ..<11]) expectFirstMatch("don't", simpleWordRegex, "don") expectFirstMatch("Cafe\u{301}", simpleWordRegex, "Café") @@ -214,17 +227,17 @@ extension UTS18Tests { // conversions, then it shall provide at least the simple, default Unicode // case folding. func testSimpleLooseMatches() { - expectFirstMatch("Dåb", #/Dåb/#.ignoresCase(), "Dåb") - expectFirstMatch("dÅB", #/Dåb/#.ignoresCase(), "dÅB") - expectFirstMatch("D\u{212B}B", #/Dåb/#.ignoresCase(), "D\u{212B}B") + expectFirstMatch("Dåb", regex(#"Dåb"#).ignoresCase(), "Dåb") + expectFirstMatch("dÅB", regex(#"Dåb"#).ignoresCase(), "dÅB") + expectFirstMatch("D\u{212B}B", regex(#"Dåb"#).ignoresCase(), "D\u{212B}B") } func testSimpleLooseMatches_XFail() { XCTExpectFailure("Need case folding support") { let sigmas = "σΣς" - expectFirstMatch(sigmas, #/σ+/#.ignoresCase(), sigmas[...]) - expectFirstMatch(sigmas, #/Σ+/#.ignoresCase(), sigmas[...]) - expectFirstMatch(sigmas, #/ς+/#.ignoresCase(), sigmas[...]) + expectFirstMatch(sigmas, regex(#"σ+"#).ignoresCase(), sigmas[...]) + expectFirstMatch(sigmas, regex(#"Σ+"#).ignoresCase(), sigmas[...]) + expectFirstMatch(sigmas, regex(#"ς+"#).ignoresCase(), sigmas[...]) // TODO: Test German sharp S // TODO: Test char classes, e.g. 
[\p{Block=Phonetic_Extensions} [A-E]] @@ -252,23 +265,23 @@ extension UTS18Tests { """ // Check the input counts - var lines = lineInput.matches(of: #/\d{2}/#) + var lines = lineInput.matches(of: regex(#"\d{2}"#)) XCTAssertEqual(lines.count, 11) // Test \R - newline sequence - lines = lineInput.matches(of: #/\d{2}\R/#) + lines = lineInput.matches(of: regex(#"\d{2}\R"#)) XCTAssertEqual(lines.count, 11) // Test anchors as line boundaries - lines = lineInput.matches(of: #/^\d{2}$/#.anchorsMatchLineEndings()) + lines = lineInput.matches(of: regex(#"^\d{2}$"#).anchorsMatchLineEndings()) XCTAssertEqual(lines.count, 11) // Test that dot does not match line endings - lines = lineInput.matches(of: #/.+/#) + lines = lineInput.matches(of: regex(#".+"#)) XCTAssertEqual(lines.count, 11) // Does not contain an empty line - XCTAssertFalse(lineInput.contains(#/^$/#)) + XCTAssertFalse(lineInput.contains(regex(#"^$"#))) // Does contain an empty line (between \n and \r, which are reversed here) let empty = "\n\r" - XCTAssertTrue(empty.contains(#/^$/#.anchorsMatchLineEndings())) + XCTAssertTrue(empty.contains(regex(#"^$"#).anchorsMatchLineEndings())) } // RL1.7 Supplementary Code Points @@ -279,9 +292,9 @@ extension UTS18Tests { // surrogate followed by a trailing surrogate shall be handled as a single // code point in matching. 
func testSupplementaryCodePoints() { - XCTAssertTrue("👍".contains(#/\u{1F44D}/#)) - XCTAssertTrue("👍".contains(#/[\u{1F440}-\u{1F44F}]/#)) - XCTAssertTrue("👍👎".contains(#/^[\u{1F440}-\u{1F44F}]+$/#)) + XCTAssertTrue("👍".contains(regex(#"\u{1F44D}"#))) + XCTAssertTrue("👍".contains(regex(#"[\u{1F440}-\u{1F44F}]"#))) + XCTAssertTrue("👍👎".contains(regex(#"^[\u{1F440}-\u{1F44F}]+$"#))) } } @@ -304,11 +317,11 @@ extension UTS18Tests { ] let regexes = [ - #/\u{006f}\u{031b}\u{0323}/#, // o + horn + dot_below - #/\u{006f}\u{0323}\u{031b}/#, // o + dot_below + horn - #/\u{01a1}\u{0323}/#, // o-horn + dot_below - #/\u{1ecd}\u{031b}/#, // o-dot_below + horn - #/\u{1ee3}/#, // o-horn-dot_below + regex(#"\u{006f}\u{031b}\u{0323}"#), // o + horn + dot_below + regex(#"\u{006f}\u{0323}\u{031b}"#), // o + dot_below + horn + regex(#"\u{01a1}\u{0323}"#), // o-horn + dot_below + regex(#"\u{1ecd}\u{031b}"#), // o-dot_below + horn + regex(#"\u{1ee3}"#), // o-horn-dot_below ] // Default: Grapheme cluster semantics @@ -343,14 +356,14 @@ extension UTS18Tests { // matching against an arbitrary extended grapheme cluster, Character Classes // with Strings, and extended grapheme cluster boundaries. 
func testExtendedGraphemeClusters() { - XCTAssertTrue("abcdef🇬🇭".contains(#/abcdef.$/#)) - XCTAssertTrue("abcdef🇬🇭".contains(#/abcdef\X$/#)) - XCTAssertTrue("abcdef🇬🇭".contains(#/abcdef\X$/#.matchingSemantics(.unicodeScalar))) - XCTAssertTrue("abcdef🇬🇭".contains(#/abcdef.+\y/#.matchingSemantics(.unicodeScalar))) + XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef.$"#))) + XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#))) + XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#).matchingSemantics(.unicodeScalar))) + XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef.+\y"#).matchingSemantics(.unicodeScalar))) } func testCharacterClassesWithStrings() { - let regex = #/[a-z🧐🇧🇪🇧🇫🇧🇬]/# + let regex = regex(#"[a-z🧐🇧🇪🇧🇫🇧🇬]"#) XCTAssertTrue("🧐".contains(regex)) XCTAssertTrue("🇧🇫".contains(regex)) } @@ -399,43 +412,43 @@ extension UTS18Tests { } func testIndividuallyNamedCharacters() { - XCTAssertTrue("\u{263A}".contains(#/\N{WHITE SMILING FACE}/#)) - XCTAssertTrue("\u{3B1}".contains(#/\N{GREEK SMALL LETTER ALPHA}/#)) - XCTAssertTrue("\u{10450}".contains(#/\N{SHAVIAN LETTER PEEP}/#)) + XCTAssertTrue("\u{263A}".contains(regex(#"\N{WHITE SMILING FACE}"#))) + XCTAssertTrue("\u{3B1}".contains(regex(#"\N{GREEK SMALL LETTER ALPHA}"#))) + XCTAssertTrue("\u{10450}".contains(regex(#"\N{SHAVIAN LETTER PEEP}"#))) - XCTAssertTrue("\u{FEFF}".contains(#/\N{ZERO WIDTH NO-BREAK SPACE}/#)) - XCTAssertTrue("강".contains(#/\N{HANGUL SYLLABLE GANG}/#)) - XCTAssertTrue("\u{1F514}".contains(#/\N{BELL}/#)) - XCTAssertTrue("🐯".contains(#/\N{TIGER FACE}/#)) - XCTAssertFalse("🐯".contains(#/\N{TIEGR FACE}/#)) + XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{ZERO WIDTH NO-BREAK SPACE}"#))) + XCTAssertTrue("강".contains(regex(#"\N{HANGUL SYLLABLE GANG}"#))) + XCTAssertTrue("\u{1F514}".contains(regex(#"\N{BELL}"#))) + XCTAssertTrue("🐯".contains(regex(#"\N{TIGER FACE}"#))) + XCTAssertFalse("🐯".contains(regex(#"\N{TIEGR FACE}"#))) // Loose matching - 
XCTAssertTrue("\u{263A}".contains(#/\N{whitesmilingface}/#)) - XCTAssertTrue("\u{263A}".contains(#/\N{wHiTe_sMiLiNg_fAcE}/#)) - XCTAssertTrue("\u{263A}".contains(#/\N{White Smiling-Face}/#)) - XCTAssertTrue("\u{FEFF}".contains(#/\N{zerowidthno breakspace}/#)) + XCTAssertTrue("\u{263A}".contains(regex(#"\N{whitesmilingface}"#))) + XCTAssertTrue("\u{263A}".contains(regex(#"\N{wHiTe_sMiLiNg_fAcE}"#))) + XCTAssertTrue("\u{263A}".contains(regex(#"\N{White Smiling-Face}"#))) + XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{zerowidthno breakspace}"#))) // Matching semantic level - XCTAssertFalse("👩‍👩‍👧‍👦".contains(#/.\N{ZERO WIDTH JOINER}/#)) - XCTAssertTrue("👩‍👩‍👧‍👦".contains(#/(?u).\N{ZERO WIDTH JOINER}/#)) + XCTAssertFalse("👩‍👩‍👧‍👦".contains(regex(#".\N{ZERO WIDTH JOINER}"#))) + XCTAssertTrue("👩‍👩‍👧‍👦".contains(regex(#"(?u).\N{ZERO WIDTH JOINER}"#))) } func testIndividuallyNamedCharacters_XFail() { XCTExpectFailure("Need to support named chars in custom character classes") { - XCTFail("\(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#)") + XCTFail("\(regex(#"[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+"#))") // XCTAssertTrue("^\u{3B1}\u{3B2}$".contains(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#)) } XCTExpectFailure("Other named char failures -- investigate") { - XCTAssertTrue("\u{C}".contains(#/\N{FORM FEED}/#)) - XCTAssertTrue("\u{FEFF}".contains(#/\N{BYTE ORDER MARK}/#)) - XCTAssertTrue("\u{FEFF}".contains(#/\N{BOM}/#)) - XCTAssertTrue("\u{7}".contains(#/\N{BEL}/#)) + XCTAssertTrue("\u{C}".contains(regex(#"\N{FORM FEED}"#))) + XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BYTE ORDER MARK}"#))) + XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BOM}"#))) + XCTAssertTrue("\u{7}".contains(regex(#"\N{BEL}"#))) } XCTExpectFailure("Need to recognize invalid names at compile time") { XCTFail("This should be a compilation error, not a match failure:") - XCTAssertFalse("abc".contains(#/\N{NOT AN ACTUAL CHARACTER NAME}/#)) + 
XCTAssertFalse("abc".contains(regex(#"\N{NOT AN ACTUAL CHARACTER NAME}"#))) } } @@ -509,9 +522,9 @@ extension UTS18Tests { // Case_Ignorable // Changes_When_Lowercased // Changes_When_Uppercased - XCTAssertTrue("a".contains(#/\p{Changes_When_Uppercased}/#)) - XCTAssertTrue("a".contains(#/\p{Changes_When_Uppercased=true}/#)) - XCTAssertFalse("A".contains(#/\p{Changes_When_Uppercased}/#)) + XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased}"#))) + XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased=true}"#))) + XCTAssertFalse("A".contains(regex(#"\p{Changes_When_Uppercased}"#))) // Changes_When_Titlecased // Changes_When_Casefolded // Changes_When_Casemapped From bb60e493a42a7a3353d770b2ce3bb284601e6014 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 5 May 2022 15:02:11 -0500 Subject: [PATCH 12/13] Fix name string loose equality check --- .../_StringProcessing/ConsumerInterface.swift | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 7331897f1..d27b89314 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -120,27 +120,27 @@ extension String { /// FIXME: Special case for U+1180 HANGUL JUNGSEONG O-E /// See https://www.unicode.org/reports/tr44/#Matching_Rules fileprivate func isEqualByUAX44LM2(to other: String) -> Bool { - var i = startIndex - var j = other.startIndex + var index = startIndex + var otherIndex = other.startIndex - while i < endIndex { - if self[i].isWhitespace || self[i] == "-" || self[i] == "_" { - formIndex(after: &i) + while index < endIndex && otherIndex < other.endIndex { + if self[index].isWhitespace || self[index] == "-" || self[index] == "_" { + formIndex(after: &index) continue } - if other[j].isWhitespace || other[j] == "-" || other[j] == "_" { - other.formIndex(after: &j) + if 
other[otherIndex].isWhitespace || other[otherIndex] == "-" || other[otherIndex] == "_" { + other.formIndex(after: &otherIndex) continue } - if self[i] != other[j] && self[i].lowercased() != other[j].lowercased() { + if self[index] != other[otherIndex] && self[index].lowercased() != other[otherIndex].lowercased() { return false } - formIndex(after: &i) - other.formIndex(after: &j) + formIndex(after: &index) + other.formIndex(after: &otherIndex) } - return i == endIndex && j == other.endIndex + return index == endIndex && otherIndex == other.endIndex } } From c3c4621ca6352f8ec87f75915f979043fefe4f7f Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 5 May 2022 15:17:50 -0500 Subject: [PATCH 13/13] Revert scalar-by-scalar matching --- Sources/_StringProcessing/ByteCodeGen.swift | 9 --------- Tests/RegexTests/MatchTests.swift | 1 - 2 files changed, 10 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index dcf543f94..2131d1eb5 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -187,16 +187,7 @@ extension Compiler.ByteCodeGen { : nil } } else { - let done = builder.makeAddress() - let next = builder.makeAddress() - builder.buildSave(next) - for scalar in c.unicodeScalars { - try emitScalar(scalar) - } - builder.buildBranch(to: done) - builder.label(next) builder.buildMatch(c) - builder.label(done) } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 4bf2da106..83b73fe35 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1453,7 +1453,6 @@ extension RegexTests { (eDecomposed, true)) // \p{Letter} firstMatchTest(#"\p{Letter}$"#, input: eComposed, match: eComposed) - // FIXME: \p{Letter} doesn't match a decomposed character firstMatchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed) // \d