diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift
index 2a612a1de..623589b54 100644
--- a/Sources/_StringProcessing/Regex/Options.swift
+++ b/Sources/_StringProcessing/Regex/Options.swift
@@ -14,40 +14,36 @@
 @available(SwiftStdlib 5.7, *)
 extension RegexComponent {
   /// Returns a regular expression that ignores casing when matching.
-  public func ignoringCase(_ ignoreCase: Bool = true) -> Regex<RegexOutput> {
-    wrapInOption(.caseInsensitive, addingIf: ignoreCase)
+  public func ignoresCase(_ ignoresCase: Bool = true) -> Regex<RegexOutput> {
+    wrapInOption(.caseInsensitive, addingIf: ignoresCase)
   }
 
   /// Returns a regular expression that only matches ASCII characters as "word
   /// characters".
-  public func usingASCIIWordCharacters(_ useASCII: Bool = true) -> Regex<RegexOutput> {
-    wrapInOption(.asciiOnlyDigit, addingIf: useASCII)
+  public func asciiOnlyWordCharacters(_ useASCII: Bool = true) -> Regex<RegexOutput> {
+    wrapInOption(.asciiOnlyWord, addingIf: useASCII)
   }
 
   /// Returns a regular expression that only matches ASCII characters as digits.
-  public func usingASCIIDigits(_ useASCII: Bool = true) -> Regex<RegexOutput> {
+  public func asciiOnlyDigits(_ useASCII: Bool = true) -> Regex<RegexOutput> {
     wrapInOption(.asciiOnlyDigit, addingIf: useASCII)
   }
 
   /// Returns a regular expression that only matches ASCII characters as space
   /// characters.
-  public func usingASCIISpaces(_ useASCII: Bool = true) -> Regex<RegexOutput> {
+  public func asciiOnlyWhitespace(_ useASCII: Bool = true) -> Regex<RegexOutput> {
     wrapInOption(.asciiOnlySpace, addingIf: useASCII)
   }
 
   /// Returns a regular expression that only matches ASCII characters when
   /// matching character classes.
-  public func usingASCIICharacterClasses(_ useASCII: Bool = true) -> Regex<RegexOutput> {
+  public func asciiOnlyCharacterClasses(_ useASCII: Bool = true) -> Regex<RegexOutput> {
     wrapInOption(.asciiOnlyPOSIXProps, addingIf: useASCII)
   }
 
-  /// Returns a regular expression that uses the Unicode word boundary
-  /// algorithm.
-  ///
-  /// This option is enabled by default; pass `false` to disable use of
-  /// Unicode's word boundary algorithm.
-  public func usingUnicodeWordBoundaries(_ useUnicodeWordBoundaries: Bool = true) -> Regex<RegexOutput> {
-    wrapInOption(.unicodeWordBoundaries, addingIf: useUnicodeWordBoundaries)
+  /// Returns a regular expression that uses the specified word boundary algorithm.
+  public func wordBoundaryKind(_ wordBoundaryKind: RegexWordBoundaryKind) -> Regex<RegexOutput> {
+    wrapInOption(.unicodeWordBoundaries, addingIf: wordBoundaryKind == .unicodeLevel2)
   }
 
   /// Returns a regular expression where the start and end of input
@@ -133,6 +129,7 @@ extension RegexComponent {
 }
 
 @available(SwiftStdlib 5.7, *)
+/// A semantic level to use during regex matching.
 public struct RegexSemanticLevel: Hashable {
   internal enum Representation {
     case graphemeCluster
@@ -154,6 +151,38 @@ public struct RegexSemanticLevel: Hashable {
   }
 }
 
+@available(SwiftStdlib 5.7, *)
+/// A word boundary algorithm to use during regex matching.
+public struct RegexWordBoundaryKind: Hashable {
+  internal enum Representation {
+    case unicodeLevel1
+    case unicodeLevel2
+  }
+
+  internal var base: Representation
+
+  /// A word boundary algorithm that implements the "simple word boundary"
+  /// Unicode recommendation.
+  ///
+  /// A simple word boundary is a position in the input between two characters
+  /// that match `/\w\W/` or `/\W\w/`, or between the start or end of the input
+  /// and a `\w` character. Word boundaries therefore depend on the option-
+  /// defined behavior of `\w`.
+  public static var unicodeLevel1: Self {
+    .init(base: .unicodeLevel1)
+  }
+
+  /// A word boundary algorithm that implements the "default word boundary"
+  /// Unicode recommendation.
+  ///
+  /// Default word boundaries use a Unicode algorithm that handles some cases
+  /// better than simple word boundaries, such as words with internal
+  /// punctuation, changes in script, and Emoji.
+  public static var unicodeLevel2: Self {
+    .init(base: .unicodeLevel2)
+  }
+}
+
 // MARK: - Helper method
 
 @available(SwiftStdlib 5.7, *)
diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift
index 3d8c4fc2c..897bca8f7 100644
--- a/Tests/RegexBuilderTests/RegexDSLTests.swift
+++ b/Tests/RegexBuilderTests/RegexDSLTests.swift
@@ -228,7 +228,7 @@ class RegexDSLTests: XCTestCase {
       matchType: Substring.self, ==) {
         OneOrMore {
           "abc"
-        }.ignoringCase(true)
+        }.ignoresCase(true)
       }
 
     // Multiple options on one component wrap successively, but do not
@@ -242,8 +242,8 @@ class RegexDSLTests: XCTestCase {
         OneOrMore {
           "abc"
         }
-        .ignoringCase(true)
-        .ignoringCase(false)
+        .ignoresCase(true)
+        .ignoresCase(false)
       }
 
     // An option on an outer component doesn't override an option set on an
@@ -257,12 +257,36 @@ class RegexDSLTests: XCTestCase {
       ("abcdeABCdeaBcde", "abcdeABCdeaBcde"),
       matchType: Substring.self, ==) {
         OneOrMore {
-          "abc".ignoringCase(true)
+          "abc".ignoresCase(true)
           Optionally("de")
         }
-        .ignoringCase(false)
+        .ignoresCase(false)
       }
 
+#if os(macOS)
+    try XCTExpectFailure("Implement level 2 word boundaries") {
+      try _testDSLCaptures(
+        ("can't stop won't stop", ("can't stop won't stop", "can't", "won")),
+        matchType: (Substring, Substring, Substring).self, ==) {
+          Capture {
+            OneOrMore(.word)
+            Anchor.wordBoundary
+          }
+          OneOrMore(.any, .reluctantly)
+          "stop"
+          " "
+
+          Capture {
+            OneOrMore(.word)
+            Anchor.wordBoundary
+          }
+          .wordBoundaryKind(.unicodeLevel1)
+          OneOrMore(.any, .reluctantly)
+          "stop"
+        }
+    }
+#endif
+
     try _testDSLCaptures(
       ("abcdef123", ("abcdef123", "a", "123")),
       matchType: (Substring, Substring, Substring).self, ==) {
@@ -280,6 +304,18 @@ class RegexDSLTests: XCTestCase {
         }
         ZeroOrMore(.digit)
       }
+
+    try _testDSLCaptures(
+      ("abcdefg", ("abcdefg", "abcdefg")),
+      ("abcdéfg", ("abcdéfg", "abcd")),
+      matchType: (Substring, Substring).self, ==) {
+        Capture {
+          OneOrMore(.word)
+        }
+        .asciiOnlyWordCharacters()
+
+        ZeroOrMore(.any)
+      }
   }
 
   func testQuantificationBehavior() throws {
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index 8e92c5936..4d9ed4d01 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -1337,7 +1337,7 @@ extension RegexTests {
     XCTAssertTrue ("cafe".contains(regex))
     XCTAssertFalse("CaFe".contains(regex))
 
-    let caseInsensitiveRegex = regex.ignoringCase()
+    let caseInsensitiveRegex = regex.ignoresCase()
     XCTAssertTrue("cafe".contains(caseInsensitiveRegex))
     XCTAssertTrue("CaFe".contains(caseInsensitiveRegex))
   }