From 0552f74690abd65919d69ef7db87b4f1f141c271 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 18 Apr 2022 14:13:30 -0500 Subject: [PATCH 1/3] Add word boundary kind type --- Sources/_StringProcessing/Regex/Options.swift | 43 ++++++++++++++++--- Tests/RegexBuilderTests/RegexDSLTests.swift | 24 +++++++++++ 2 files changed, 60 insertions(+), 7 deletions(-) diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift index d474caae3..d4f76e611 100644 --- a/Sources/_StringProcessing/Regex/Options.swift +++ b/Sources/_StringProcessing/Regex/Options.swift @@ -41,13 +41,9 @@ extension RegexComponent { wrapInOption(.asciiOnlyPOSIXProps, addingIf: useASCII) } - /// Returns a regular expression that uses the Unicode word boundary - /// algorithm. - /// - /// This option is enabled by default; pass `false` to disable use of - /// Unicode's word boundary algorithm. - public func usingUnicodeWordBoundaries(_ useUnicodeWordBoundaries: Bool = true) -> Regex { - wrapInOption(.unicodeWordBoundaries, addingIf: useUnicodeWordBoundaries) + /// Returns a regular expression that uses the specified word boundary algorithm. + public func identifyingWordBoundaries(with wordBoundaryKind: RegexWordBoundaryKind) -> Regex { + wrapInOption(.unicodeWordBoundaries, addingIf: wordBoundaryKind == .unicodeLevel2) } /// Returns a regular expression where the start and end of input @@ -107,6 +103,7 @@ extension RegexComponent { } @available(SwiftStdlib 5.7, *) +/// A semantic level to use during regex matching. public struct RegexSemanticLevel: Hashable { internal enum Representation { case graphemeCluster @@ -128,6 +125,38 @@ public struct RegexSemanticLevel: Hashable { } } +@available(SwiftStdlib 5.7, *) +/// A word boundary algorithm to use during regex matching. +public struct RegexWordBoundaryKind: Hashable { + internal enum Representation { + case unicodeLevel1 + case unicodeLevel2 + } + + internal var base: Representation + + /// A word boundary algorithm that implements the "simple word boundary" + /// Unicode recommendation. + /// + /// A simple word boundary is a position in the input between two characters + /// that match `/\w\W/` or `/\W\w/`, or between the start or end of the input + /// and a `\w` character. Word boundaries therefore depend on the option- + /// defined behavior of `\w`. + public static var unicodeLevel1: Self { + .init(base: .unicodeLevel1) + } + + /// A word boundary algorithm that implements the "default word boundary" + /// Unicode recommendation. + /// + /// Default word boundaries use a Unicode algorithm that handles some cases + /// better than simple word boundaries, such as words with internal + /// punctuation, changes in script, and Emoji. + public static var unicodeLevel2: Self { + .init(base: .unicodeLevel2) + } +} + // Options that only affect literals @available(SwiftStdlib 5.7, *) extension RegexComponent { diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 8159ba8ae..bed4659bc 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -262,6 +262,30 @@ class RegexDSLTests: XCTestCase { } .ignoringCase(false) } + +#if os(macOS) + try XCTExpectFailure("Implement level 2 word boundaries") { + try _testDSLCaptures( + ("can't stop won't stop", ("can't stop won't stop", "can't", "won")), + matchType: (Substring, Substring, Substring).self, ==) { + Capture { + OneOrMore(.word) + Anchor.wordBoundary + } + OneOrMore(.any, .reluctantly) + "stop" + " " + + Capture { + OneOrMore(.word) + Anchor.wordBoundary + } + .identifyingWordBoundaries(with: .unicodeLevel1) + OneOrMore(.any, .reluctantly) + "stop" + } + } +#endif } func testQuantificationBehavior() throws { From 1d68bc84f68945c999a6bc0559f4f4b0a7c81418 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 18 Apr 2022 15:11:12 -0500 Subject: [PATCH 2/3] Skip XFAIL'd test on Linux --- Tests/RegexBuilderTests/RegexDSLTests.swift | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 81d258fd0..98b309a5b 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -263,6 +263,7 @@ class RegexDSLTests: XCTestCase { .ignoringCase(false) } +#if os(macOS) try XCTExpectFailure("Implement level 2 word boundaries") { try _testDSLCaptures( ("can't stop won't stop", ("can't stop won't stop", "can't", "won")), @@ -284,7 +285,8 @@ class RegexDSLTests: XCTestCase { "stop" } } - +#endif + try _testDSLCaptures( ("abcdef123", ("abcdef123", "a", "123")), matchType: (Substring, Substring, Substring).self, ==) { From b63e2ed6994afe845dafcdcb6a636737b951518f Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 18 Apr 2022 15:28:04 -0500 Subject: [PATCH 3/3] Nominalize all matching option modifiers --- Sources/_StringProcessing/Regex/Options.swift | 16 ++++++------- Tests/RegexBuilderTests/RegexDSLTests.swift | 24 ++++++++++++++----- Tests/RegexTests/MatchTests.swift | 2 +- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift index b4fc43b1a..623589b54 100644 --- a/Sources/_StringProcessing/Regex/Options.swift +++ b/Sources/_StringProcessing/Regex/Options.swift @@ -14,35 +14,35 @@ @available(SwiftStdlib 5.7, *) extension RegexComponent { /// Returns a regular expression that ignores casing when matching. - public func ignoringCase(_ ignoreCase: Bool = true) -> Regex { - wrapInOption(.caseInsensitive, addingIf: ignoreCase) + public func ignoresCase(_ ignoresCase: Bool = true) -> Regex { + wrapInOption(.caseInsensitive, addingIf: ignoresCase) } /// Returns a regular expression that only matches ASCII characters as "word /// characters". - public func usingASCIIWordCharacters(_ useASCII: Bool = true) -> Regex { - wrapInOption(.asciiOnlyDigit, addingIf: useASCII) + public func asciiOnlyWordCharacters(_ useASCII: Bool = true) -> Regex { + wrapInOption(.asciiOnlyWord, addingIf: useASCII) } /// Returns a regular expression that only matches ASCII characters as digits. - public func usingASCIIDigits(_ useASCII: Bool = true) -> Regex { + public func asciiOnlyDigits(_ useASCII: Bool = true) -> Regex { wrapInOption(.asciiOnlyDigit, addingIf: useASCII) } /// Returns a regular expression that only matches ASCII characters as space /// characters. - public func usingASCIISpaces(_ useASCII: Bool = true) -> Regex { + public func asciiOnlyWhitespace(_ useASCII: Bool = true) -> Regex { wrapInOption(.asciiOnlySpace, addingIf: useASCII) } /// Returns a regular expression that only matches ASCII characters when /// matching character classes. - public func usingASCIICharacterClasses(_ useASCII: Bool = true) -> Regex { + public func asciiOnlyCharacterClasses(_ useASCII: Bool = true) -> Regex { wrapInOption(.asciiOnlyPOSIXProps, addingIf: useASCII) } /// Returns a regular expression that uses the specified word boundary algorithm. - public func identifyingWordBoundaries(with wordBoundaryKind: RegexWordBoundaryKind) -> Regex { + public func wordBoundaryKind(_ wordBoundaryKind: RegexWordBoundaryKind) -> Regex { wrapInOption(.unicodeWordBoundaries, addingIf: wordBoundaryKind == .unicodeLevel2) } diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 98b309a5b..897bca8f7 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -228,7 +228,7 @@ class RegexDSLTests: XCTestCase { matchType: Substring.self, ==) { OneOrMore { "abc" - }.ignoringCase(true) + }.ignoresCase(true) } // Multiple options on one component wrap successively, but do not @@ -242,8 +242,8 @@ class RegexDSLTests: XCTestCase { OneOrMore { "abc" } - .ignoringCase(true) - .ignoringCase(false) + .ignoresCase(true) + .ignoresCase(false) } // An option on an outer component doesn't override an option set on an @@ -257,10 +257,10 @@ class RegexDSLTests: XCTestCase { ("abcdeABCdeaBcde", "abcdeABCdeaBcde"), matchType: Substring.self, ==) { OneOrMore { - "abc".ignoringCase(true) + "abc".ignoresCase(true) Optionally("de") } - .ignoringCase(false) + .ignoresCase(false) } #if os(macOS) @@ -280,7 +280,7 @@ class RegexDSLTests: XCTestCase { OneOrMore(.word) Anchor.wordBoundary } - .identifyingWordBoundaries(with: .unicodeLevel1) + .wordBoundaryKind(.unicodeLevel1) OneOrMore(.any, .reluctantly) "stop" } @@ -304,6 +304,18 @@ class RegexDSLTests: XCTestCase { } ZeroOrMore(.digit) } + + try _testDSLCaptures( + ("abcdefg", ("abcdefg", "abcdefg")), + ("abcdéfg", ("abcdéfg", "abcd")), + matchType: (Substring, Substring).self, ==) { + Capture { + OneOrMore(.word) + } + .asciiOnlyWordCharacters() + + ZeroOrMore(.any) + } } func testQuantificationBehavior() throws { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 8e92c5936..4d9ed4d01 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1337,7 +1337,7 @@ extension RegexTests { XCTAssertTrue ("cafe".contains(regex)) XCTAssertFalse("CaFe".contains(regex)) - let caseInsensitiveRegex = regex.ignoringCase() + let caseInsensitiveRegex = regex.ignoresCase() XCTAssertTrue("cafe".contains(caseInsensitiveRegex)) XCTAssertTrue("CaFe".contains(caseInsensitiveRegex)) }