diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift
index 2a612a1de..b4fc43b1a 100644
--- a/Sources/_StringProcessing/Regex/Options.swift
+++ b/Sources/_StringProcessing/Regex/Options.swift
@@ -41,13 +41,9 @@ extension RegexComponent {
     wrapInOption(.asciiOnlyPOSIXProps, addingIf: useASCII)
   }
 
-  /// Returns a regular expression that uses the Unicode word boundary
-  /// algorithm.
-  ///
-  /// This option is enabled by default; pass `false` to disable use of
-  /// Unicode's word boundary algorithm.
-  public func usingUnicodeWordBoundaries(_ useUnicodeWordBoundaries: Bool = true) -> Regex {
-    wrapInOption(.unicodeWordBoundaries, addingIf: useUnicodeWordBoundaries)
+  /// Returns a regular expression that uses the specified word boundary algorithm.
+  public func identifyingWordBoundaries(with wordBoundaryKind: RegexWordBoundaryKind) -> Regex {
+    wrapInOption(.unicodeWordBoundaries, addingIf: wordBoundaryKind == .unicodeLevel2)
   }
 
   /// Returns a regular expression where the start and end of input
@@ -133,6 +129,7 @@ extension RegexComponent {
 }
 
 @available(SwiftStdlib 5.7, *)
+/// A semantic level to use during regex matching.
 public struct RegexSemanticLevel: Hashable {
   internal enum Representation {
     case graphemeCluster
@@ -154,6 +151,38 @@ public struct RegexSemanticLevel: Hashable {
   }
 }
 
+@available(SwiftStdlib 5.7, *)
+/// A word boundary algorithm to use during regex matching.
+public struct RegexWordBoundaryKind: Hashable {
+  internal enum Representation {
+    case unicodeLevel1
+    case unicodeLevel2
+  }
+
+  internal var base: Representation
+
+  /// A word boundary algorithm that implements the "simple word boundary"
+  /// Unicode recommendation.
+  ///
+  /// A simple word boundary is a position in the input between two characters
+  /// that match `/\w\W/` or `/\W\w/`, or between the start or end of the input
+  /// and a `\w` character. Word boundaries therefore depend on the option-
+  /// defined behavior of `\w`.
+  public static var unicodeLevel1: Self {
+    .init(base: .unicodeLevel1)
+  }
+
+  /// A word boundary algorithm that implements the "default word boundary"
+  /// Unicode recommendation.
+  ///
+  /// Default word boundaries use a Unicode algorithm that handles some cases
+  /// better than simple word boundaries, such as words with internal
+  /// punctuation, changes in script, and Emoji.
+  public static var unicodeLevel2: Self {
+    .init(base: .unicodeLevel2)
+  }
+}
+
 // MARK: - Helper method
 
 @available(SwiftStdlib 5.7, *)
diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift
index 3d8c4fc2c..98b309a5b 100644
--- a/Tests/RegexBuilderTests/RegexDSLTests.swift
+++ b/Tests/RegexBuilderTests/RegexDSLTests.swift
@@ -263,6 +263,30 @@ class RegexDSLTests: XCTestCase {
         .ignoringCase(false)
       }
 
+#if os(macOS)
+    try XCTExpectFailure("Implement level 2 word boundaries") {
+      try _testDSLCaptures(
+        ("can't stop won't stop", ("can't stop won't stop", "can't", "won")),
+        matchType: (Substring, Substring, Substring).self, ==) {
+          Capture {
+            OneOrMore(.word)
+            Anchor.wordBoundary
+          }
+          OneOrMore(.any, .reluctantly)
+          "stop"
+          " "
+
+          Capture {
+            OneOrMore(.word)
+            Anchor.wordBoundary
+          }
+          .identifyingWordBoundaries(with: .unicodeLevel1)
+          OneOrMore(.any, .reluctantly)
+          "stop"
+        }
+    }
+#endif
+
     try _testDSLCaptures(
       ("abcdef123", ("abcdef123", "a", "123")),
       matchType: (Substring, Substring, Substring).self, ==) {