diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 3cfbdcbd1..913d6eb3b 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -1,3 +1,17 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +@_spi(_Unicode) +import Swift + @_implementationOnly import _RegexParser extension Compiler { @@ -104,12 +118,13 @@ fileprivate extension Compiler.ByteCodeGen { // need to supply both a slice bounds and a per-search bounds. switch kind { case .startOfSubject: - builder.buildAssert { (input, pos, subjectBounds) in + builder.buildAssert { (_, _, input, pos, subjectBounds) in pos == subjectBounds.lowerBound } case .endOfSubjectBeforeNewline: - builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in + builder.buildAssert { [semanticLevel = options.semanticLevel] + (_, _, input, pos, subjectBounds) in if pos == subjectBounds.upperBound { return true } switch semanticLevel { case .graphemeCluster: @@ -122,7 +137,7 @@ fileprivate extension Compiler.ByteCodeGen { } case .endOfSubject: - builder.buildAssert { (input, pos, subjectBounds) in + builder.buildAssert { (_, _, input, pos, subjectBounds) in pos == subjectBounds.upperBound } @@ -135,16 +150,16 @@ fileprivate extension Compiler.ByteCodeGen { // FIXME: This needs to be based on `searchBounds`, // not the `subjectBounds` given as an argument here - builder.buildAssert { (input, pos, subjectBounds) in false } + builder.buildAssert { (_, _, input, pos, subjectBounds) in false } case 
.textSegment: - builder.buildAssert { (input, pos, _) in + builder.buildAssert { (_, _, input, pos, _) in // FIXME: Grapheme or word based on options input.isOnGraphemeClusterBoundary(pos) } case .notTextSegment: - builder.buildAssert { (input, pos, _) in + builder.buildAssert { (_, _, input, pos, _) in // FIXME: Grapheme or word based on options !input.isOnGraphemeClusterBoundary(pos) } @@ -155,7 +170,8 @@ fileprivate extension Compiler.ByteCodeGen { // the DSL-based `.startOfLine` anchor should always match the start // of a line. Right now we don't distinguish between those anchors. if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in + builder.buildAssert { [semanticLevel = options.semanticLevel] + (_, _, input, pos, subjectBounds) in if pos == subjectBounds.lowerBound { return true } switch semanticLevel { case .graphemeCluster: @@ -165,7 +181,7 @@ fileprivate extension Compiler.ByteCodeGen { } } } else { - builder.buildAssert { (input, pos, subjectBounds) in + builder.buildAssert { (_, _, input, pos, subjectBounds) in pos == subjectBounds.lowerBound } } @@ -176,7 +192,8 @@ fileprivate extension Compiler.ByteCodeGen { // the DSL-based `.endOfLine` anchor should always match the end // of a line. Right now we don't distinguish between those anchors. 
if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in + builder.buildAssert { [semanticLevel = options.semanticLevel] + (_, _, input, pos, subjectBounds) in if pos == subjectBounds.upperBound { return true } switch semanticLevel { case .graphemeCluster: @@ -186,25 +203,41 @@ fileprivate extension Compiler.ByteCodeGen { } } } else { - builder.buildAssert { (input, pos, subjectBounds) in + builder.buildAssert { (_, _, input, pos, subjectBounds) in pos == subjectBounds.upperBound } } case .wordBoundary: - // TODO: May want to consider Unicode level - builder.buildAssert { [options] (input, pos, subjectBounds) in - // TODO: How should we handle bounds? - _CharacterClassModel.word.isBoundary( - input, at: pos, bounds: subjectBounds, with: options) + builder.buildAssert { [options] + (cache, maxIndex, input, pos, subjectBounds) in + if options.usesSimpleUnicodeBoundaries { + // TODO: How should we handle bounds? + return _CharacterClassModel.word.isBoundary( + input, + at: pos, + bounds: subjectBounds, + with: options + ) + } else { + return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex) + } } case .notWordBoundary: - // TODO: May want to consider Unicode level - builder.buildAssert { [options] (input, pos, subjectBounds) in - // TODO: How should we handle bounds? - !_CharacterClassModel.word.isBoundary( - input, at: pos, bounds: subjectBounds, with: options) + builder.buildAssert { [options] + (cache, maxIndex, input, pos, subjectBounds) in + if options.usesSimpleUnicodeBoundaries { + // TODO: How should we handle bounds? 
+ return !_CharacterClassModel.word.isBoundary( + input, + at: pos, + bounds: subjectBounds, + with: options + ) + } else { + return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex) + } } } } diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index eb2c119f2..d311b4465 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -16,7 +16,13 @@ struct MEProgram { typealias ConsumeFunction = (Input, Range<Input.Index>) -> Input.Index? typealias AssertionFunction = - (Input, Input.Index, Range<Input.Index>) throws -> Bool + ( + inout Set<String.Index>?, + inout String.Index?, + Input, + Input.Index, + Range<Input.Index> + ) throws -> Bool typealias TransformFunction = (Input, Processor._StoredCapture) throws -> Any? typealias MatcherFunction = diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index a5368138c..f7b3a65a2 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -78,6 +78,9 @@ struct Processor { var storedCaptures: Array<_StoredCapture> + var wordIndexCache: Set<String.Index>? = nil + var wordIndexMaxIndex: String.Index? = nil + var state: State = .inProgress var failureReason: Error? 
= nil @@ -401,7 +404,13 @@ extension Processor { let reg = payload.assertion let assertion = registers[reg] do { - guard try assertion(input, currentPosition, subjectBounds) else { + guard try assertion( + &wordIndexCache, + &wordIndexMaxIndex, + input, + currentPosition, + subjectBounds + ) else { signalFailure() return } diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index a5daa3f73..e56b8def2 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -311,9 +311,12 @@ extension MatchingOptions.Representation { [.reluctantByDefault, .possessiveByDefault] } + // Uses level 2 Unicode word boundaries + static var unicodeWordBoundaries: Self { .init(.unicodeWordBoundaries) } + /// The default set of options. static var `default`: Self { - [.graphemeClusterSemantics, .textSegmentGraphemeMode] + [.graphemeClusterSemantics, .textSegmentGraphemeMode, .unicodeWordBoundaries] } } diff --git a/Sources/_StringProcessing/Unicode/WordBreaking.swift b/Sources/_StringProcessing/Unicode/WordBreaking.swift new file mode 100644 index 000000000..94c311e82 --- /dev/null +++ b/Sources/_StringProcessing/Unicode/WordBreaking.swift @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +@_spi(_Unicode) +import Swift + +extension String { + /// Returns whether `i` is on a word boundary of this string, as reported by + /// the standard library's `_wordIndex(after:)`. + /// + /// `startIndex` and `endIndex` are always treated as boundaries. + /// + /// - Parameters: + /// - i: The position to test. + /// - cache: Boundaries found by earlier calls. Newly scanned boundaries are + /// merged into it, so positions revisited later are still found. + /// - maxIndex: Where the next scan resumes. Every boundary before it has + /// already been recorded in `cache`. + /// - Returns: `true` if `i` is on a word boundary. When `_wordIndex(after:)` + /// is unavailable (stdlib older than 5.7), `false` for any interior index. + func isOnWordBoundary( + at i: String.Index, + using cache: inout Set<String.Index>?, + _ maxIndex: inout String.Index? 
+ ) -> Bool { + guard i != startIndex, i != endIndex else { + return true + } + + // If our index is already in our cache, then this is obviously on a + // boundary. + if let cache = cache, cache.contains(i) { + return true + } + + // If it's not in the cache AND our max index is larger than our index, it + // means this index is never on a word boundary in our string. If our index + // is larger than max index, we may need to still do work to determine if + // i is on a boundary. If it's equal to max index, the scan below starts at + // max index and will record it. + if let maxIndex = maxIndex, i < maxIndex { + return false + } + + if #available(SwiftStdlib 5.7, *) { + var indices: Set<String.Index> = [] + var j = maxIndex ?? startIndex + + while j < endIndex, j <= i { + indices.insert(j) + j = _wordIndex(after: j) + } + + // Merge instead of overwriting: an index below `maxIndex` that is missing + // from the cache is reported as a non-boundary, so earlier hits must stay. + cache = cache?.union(indices) ?? indices + maxIndex = j + + return indices.contains(i) + } else { + return false + } + } +} diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 24d6f219e..50cdd69f2 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -271,29 +271,44 @@ class RegexDSLTests: XCTestCase { .ignoresCase(false) } -#if os(macOS) - try XCTExpectFailure("Implement level 2 word boundaries") { - try _testDSLCaptures( - ("can't stop won't stop", ("can't stop won't stop", "can't", "won")), - matchType: (Substring, Substring, Substring).self, ==) { - Capture { - OneOrMore(.word) - Anchor.wordBoundary - } - OneOrMore(.any, .reluctant) - "stop" - " " - - Capture { - OneOrMore(.word) - Anchor.wordBoundary - } - .wordBoundaryKind(.unicodeLevel1) - OneOrMore(.any, .reluctant) - "stop" + try _testDSLCaptures( + ("can't stop won't stop", ("can't stop won't stop", "can't", "won't")), + matchType: (Substring, Substring, Substring).self, ==) { + Capture { + OneOrMore(.word) + Anchor.wordBoundary } - } -#endif + OneOrMore(.any, .reluctant) + "stop" + " " + + Capture { + OneOrMore(.word) + 
Anchor.wordBoundary + } + OneOrMore(.any, .reluctant) + "stop" + } + + try _testDSLCaptures( + ("can't stop won't stop", ("can't stop won't stop", "can", "won")), + matchType: (Substring, Substring, Substring).self, ==) { + Capture { + OneOrMore(.word) + Anchor.wordBoundary + } + OneOrMore(.any, .reluctant) + "stop" + " " + + Capture { + OneOrMore(.word) + Anchor.wordBoundary + } + .wordBoundaryKind(.unicodeLevel1) + OneOrMore(.any, .reluctant) + "stop" + } try _testDSLCaptures( ("abcdef123", ("abcdef123", "a", "123")), diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 51da6d010..7a7524291 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1022,6 +1022,8 @@ extension RegexTests { (" 123\n456\n", nil), ("123 456", "456")) + // FIXME: Keep this until _wordIndex and friends are available on all test platforms (presumed intent; TODO confirm) +#if os(Linux) firstMatchTests( #"\d+\b"#, ("123", "123"), @@ -1039,6 +1041,7 @@ extension RegexTests { ("123", "23"), (" 123", "23"), ("123 456", "23")) +#endif // TODO: \G and \K do { @@ -1069,6 +1072,26 @@ extension RegexTests { ("Sol Cafe", nil), xfail: true) } + // FIXME: Keep this until _wordIndex and friends are available on all test platforms (presumed intent; TODO confirm) +#if os(Linux) + func testLevel2WordBoundaries() { + // MARK: Level 2 Word Boundaries + firstMatchTest(#"\b😊\b"#, input: "🔥😊👍", match: "😊") + firstMatchTest(#"\b👨🏽\b"#, input: "👩🏻👶🏿👨🏽🧑🏾👩🏼", match: "👨🏽") + firstMatchTest(#"\b🇺🇸\b"#, input: "🇨🇦🇺🇸🇲🇽", match: "🇺🇸") + firstMatchTest(#"\b.+\b"#, input: "€1 234,56", match: "€1 234,56") + firstMatchTest(#"〱\B㋞\Bツ"#, input: "〱㋞ツ", match: "〱㋞ツ") + firstMatchTest(#"\bhello\b"#, input: "hello〱㋞ツ", match: "hello") + firstMatchTest(#"\bChicago\b"#, input: "나는 Chicago에 산다", match: "Chicago") + firstMatchTest(#"\blove\b"#, input: "眼睛love食物", match: "love") + firstMatchTest(#"\b\u{d}\u{a}\b"#, input: "\u{d}\u{a}", match: "\u{d}\u{a}") + firstMatchTest(#"\bㅋㅋㅋ\b"#, input: "아니ㅋㅋㅋ네", match: "ㅋㅋㅋ") + firstMatchTest(#"Re\B\:\BZero"#, input: "Re:Zero Starting Life in Another 
World", match: "Re:Zero") + firstMatchTest(#"can\B\'\Bt"#, input: "I can't do that.", match: "can't") + firstMatchTest(#"\b÷\b"#, input: "3 ÷ 3 = 1", match: "÷") + } +#endif + func testMatchGroups() { // MARK: Groups @@ -1342,6 +1365,8 @@ extension RegexTests { xfail: true ) + // FIXME: Keep this until _wordIndex and friends are available on all test platforms (presumed intent; TODO confirm) +#if os(Linux) // HTML tags matchTest( #"<([a-zA-Z][a-zA-Z0-9]*)\b[^>]*>.*?</\1>"#, @@ -1359,6 +1384,7 @@ extension RegexTests { ("pass me the the kettle", ["the"]), ("this doesn't have any", nil) ) +#endif // Floats flatCaptureTest( @@ -1466,6 +1492,8 @@ extension RegexTests { ("aeiou", true), ("åe\u{301}ïôú", false)) + // FIXME: Keep this until _wordIndex and friends are available on all test platforms (presumed intent; TODO confirm) +#if os(Linux) matchTest( #"abcd\b.+"#, ("abcd ef", true), @@ -1475,12 +1503,13 @@ extension RegexTests { #"(?W)abcd\b.+"#, ("abcd ef", true), ("abcdef", false), - ("abcdéf", true)) // "dé" matches /d\b./ because "é" isn't ASCII + ("abcdéf", false)) matchTest( #"(?P)abcd\b.+"#, ("abcd ef", true), ("abcdef", false), - ("abcdéf", true)) // "dé" matches /d\b./ because "é" isn't ASCII + ("abcdéf", false)) +#endif // 'S' ASCII-only spaces matchTest(