diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift index 36285d7cc..40732255c 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift @@ -226,17 +226,53 @@ extension BidirectionalCollection where Element: Comparable { // } } +@available(SwiftStdlib 5.7, *) +struct RegexRangesCollection { + let base: RegexMatchesCollection + + init(string: Substring, regex: Regex) { + self.base = RegexMatchesCollection(base: string, regex: regex) + } +} + +@available(SwiftStdlib 5.7, *) +extension RegexRangesCollection: Sequence { + struct Iterator: IteratorProtocol { + var matchesBase: RegexMatchesCollection.Iterator + + mutating func next() -> Range? { + matchesBase.next().map(\.range) + } + } + + func makeIterator() -> Iterator { + Iterator(matchesBase: base.makeIterator()) + } +} + +@available(SwiftStdlib 5.7, *) +extension RegexRangesCollection: Collection { + typealias Index = RegexMatchesCollection.Index + + var startIndex: Index { base.startIndex } + var endIndex: Index { base.endIndex } + func index(after i: Index) -> Index { base.index(after: i) } + subscript(position: Index) -> Range { base[position].range } +} + // MARK: Regex algorithms -extension BidirectionalCollection where SubSequence == Substring { +extension Collection where SubSequence == Substring { @available(SwiftStdlib 5.7, *) @_disfavoredOverload func _ranges( of regex: R - ) -> RangesCollection> { - _ranges(of: RegexConsumer(regex)) + ) -> RegexRangesCollection { + RegexRangesCollection(string: self[...], regex: regex.regex) } +} +extension BidirectionalCollection where SubSequence == Substring { @available(SwiftStdlib 5.7, *) func _rangesFromBack( of regex: R diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift index ccc0962d5..a3f876b0e 100644 --- 
a/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Replace.swift @@ -12,21 +12,21 @@ // MARK: `CollectionSearcher` algorithms extension RangeReplaceableCollection { - func _replacing( - _ searcher: Searcher, + func _replacing( + _ ranges: Ranges, with replacement: Replacement, - subrange: Range, maxReplacements: Int = .max - ) -> Self where Searcher.Searched == SubSequence, + ) -> Self where Ranges.Element == Range, Replacement.Element == Element { precondition(maxReplacements >= 0) - var index = subrange.lowerBound var result = Self() - result.append(contentsOf: self[..( - _ searcher: Searcher, - with replacement: Replacement, - maxReplacements: Int = .max - ) -> Self where Searcher.Searched == SubSequence, - Replacement.Element == Element - { - _replacing( - searcher, - with: replacement, - subrange: startIndex..( - _ searcher: Searcher, + _ ranges: Ranges, with replacement: Replacement, maxReplacements: Int = .max - ) where Searcher.Searched == SubSequence, Replacement.Element == Element { + ) where Ranges.Element == Range, Replacement.Element == Element { self = _replacing( - searcher, + ranges, with: replacement, maxReplacements: maxReplacements) } @@ -85,9 +71,8 @@ extension RangeReplaceableCollection where Element: Equatable { maxReplacements: Int = .max ) -> Self where C.Element == Element, Replacement.Element == Element { _replacing( - ZSearcher(pattern: Array(other), by: ==), + self[subrange]._ranges(of: other), with: replacement, - subrange: subrange, maxReplacements: maxReplacements) } @@ -143,9 +128,8 @@ extension RangeReplaceableCollection maxReplacements: Int = .max ) -> Self where C.Element == Element, Replacement.Element == Element { _replacing( - PatternOrEmpty(searcher: TwoWaySearcher(pattern: Array(other))), + self[subrange]._ranges(of: other), with: replacement, - subrange: subrange, maxReplacements: maxReplacements) } @@ -195,9 +179,8 @@ extension RangeReplaceableCollection 
where SubSequence == Substring { maxReplacements: Int = .max ) -> Self where Replacement.Element == Element { _replacing( - RegexConsumer(regex), + self[subrange]._ranges(of: regex), with: replacement, - subrange: subrange, maxReplacements: maxReplacements) } diff --git a/Sources/_StringProcessing/Algorithms/Matching/Matches.swift b/Sources/_StringProcessing/Algorithms/Matching/Matches.swift index a7cd17779..094d3dfdd 100644 --- a/Sources/_StringProcessing/Algorithms/Matching/Matches.swift +++ b/Sources/_StringProcessing/Algorithms/Matching/Matches.swift @@ -183,13 +183,155 @@ extension BidirectionalCollection { // MARK: Regex algorithms +@available(SwiftStdlib 5.7, *) +struct RegexMatchesCollection { + let input: Substring + let regex: Regex + let startIndex: Index + + init(base: Substring, regex: Regex) { + self.input = base + self.regex = regex + self.startIndex = base.firstMatch(of: regex).map(Index.match) ?? .end // first match is found eagerly here; `.end` when there is none + } +} + +@available(SwiftStdlib 5.7, *) +extension RegexMatchesCollection: Sequence { + /// Returns the index to start searching for the next match after `match`, or `nil` if `match` is an empty match at the end of `input`. + fileprivate func searchIndex(after match: Regex.Match) -> String.Index? { + if !match.range.isEmpty { + return match.range.upperBound + } + + // If the last match was an empty match, advance by one position and + // run again, unless at the end of `input`. The step is one `Character` or one Unicode scalar, per the regex's semantic level. + if match.range.lowerBound == input.endIndex { + return nil + } + + switch regex.initialOptions.semanticLevel { + case .graphemeCluster: + return input.index(after: match.range.upperBound) + case .unicodeScalar: + return input.unicodeScalars.index(after: match.range.upperBound) + } + } + + struct Iterator: IteratorProtocol { + let base: RegexMatchesCollection + + // Because `RegexMatchesCollection` eagerly computes the first match for + // its `startIndex`, the iterator can use that match for its initial + // iteration. For subsequent calls to `next()`, this value is `false`, and + // `nextStart` is used to search for the next match. 
+ var initialIteration = true + var nextStart: String.Index? + + init(_ matches: RegexMatchesCollection) { + self.base = matches + self.nextStart = base.startIndex.match.flatMap(base.searchIndex(after:)) + } + + mutating func next() -> Regex.Match? { + // Initial case with pre-computed first match + if initialIteration { + initialIteration = false + return base.startIndex.match + } + + // `nextStart` is `nil` when iteration has completed + guard let start = nextStart else { + return nil + } + + // Otherwise, find the next match (if any) and compute `nextStart`; `try?` turns a thrown matching error into `nil`, which ends iteration + let match = try? base.regex.firstMatch(in: base.input[start...]) + nextStart = match.flatMap(base.searchIndex(after:)) + return match + } + } + + func makeIterator() -> Iterator { + Iterator(self) + } +} + +@available(SwiftStdlib 5.7, *) +extension RegexMatchesCollection: Collection { + enum Index: Comparable { + case match(Regex.Match) + case end + + var match: Regex.Match? { + switch self { + case .match(let match): return match + case .end: return nil + } + } + + static func == (lhs: Self, rhs: Self) -> Bool { + switch (lhs, rhs) { + case (.match(let lhs), .match(let rhs)): + return lhs.range == rhs.range // two `.match` indices are equal when their matched ranges are equal; nothing else is compared + case (.end, .end): + return true + case (.end, .match), (.match, .end): + return false + } + } + + static func < (lhs: Self, rhs: Self) -> Bool { + switch (lhs, rhs) { + case (.match(let lhs), .match(let rhs)): + // This implementation uses a tuple comparison so that an empty + // range `i.. Index { + guard let currentMatch = i.match else { + fatalError("Can't advance past the 'endIndex' of a match collection.") + } + + guard + let start = searchIndex(after: currentMatch), + let nextMatch = try? 
regex.firstMatch(in: input[start...]) + else { + return .end + } + return Index.match(nextMatch) + } + + subscript(position: Index) -> Regex.Match { + guard let match = position.match else { + fatalError("Can't subscript the 'endIndex' of a match collection.") + } + return match + } +} + extension BidirectionalCollection where SubSequence == Substring { @available(SwiftStdlib 5.7, *) @_disfavoredOverload func _matches( of regex: R - ) -> MatchesCollection> { - _matches(of: RegexConsumer(regex)) + ) -> RegexMatchesCollection { + RegexMatchesCollection(base: self[...], regex: regex.regex) } @available(SwiftStdlib 5.7, *) @@ -207,30 +349,6 @@ extension BidirectionalCollection where SubSequence == Substring { public func matches( of r: some RegexComponent ) -> [Regex.Match] { - let slice = self[...] - var start = self.startIndex - let end = self.endIndex - let regex = r.regex - - var result = [Regex.Match]() - while start <= end { - guard let match = try? regex._firstMatch( - slice.base, in: start.. 
= matches + + XCTAssertEqual(matches.map(\.output), expected) + + let i = matches.index(matches.startIndex, offsetBy: 3) + XCTAssertEqual(matches[i].output, expected[3]) + let j = matches.index(i, offsetBy: 5) + XCTAssertEqual(j, matches.endIndex) + + var index = matches.startIndex + while index < matches.endIndex { + XCTAssertEqual( + matches[index].output, + expected[matches.distance(from: matches.startIndex, to: index)]) + matches.formIndex(after: &index) + } + } } diff --git a/Tests/RegexTests/AlgorithmsTests.swift b/Tests/RegexTests/AlgorithmsTests.swift index 1a5bc34df..175746f71 100644 --- a/Tests/RegexTests/AlgorithmsTests.swift +++ b/Tests/RegexTests/AlgorithmsTests.swift @@ -498,6 +498,25 @@ class AlgorithmTests: XCTestCase { s2.ranges(of: try Regex("a*?")).map(s2.offsets(of:)), [0..<0, 1..<1, 2..<2]) } + func testUnicodeScalarSemantics() throws { + let regex = try Regex(#"."#, as: Substring.self).matchingSemantics(.unicodeScalar) + let emptyRegex = try Regex(#"z?"#, as: Substring.self).matchingSemantics(.unicodeScalar) + + XCTAssertEqual("".matches(of: regex).map(\.output), []) + XCTAssertEqual("Café".matches(of: regex).map(\.output), ["C", "a", "f", "é"]) + XCTAssertEqual("Cafe\u{301}".matches(of: regex).map(\.output), ["C", "a", "f", "e", "\u{301}"]) + XCTAssertEqual("Cafe\u{301}".matches(of: emptyRegex).count, 6) + + XCTAssertEqual("Café".ranges(of: regex).count, 4) + XCTAssertEqual("Cafe\u{301}".ranges(of: regex).count, 5) + XCTAssertEqual("Cafe\u{301}".ranges(of: emptyRegex).count, 6) + + XCTAssertEqual("Café".replacing(regex, with: "-"), "----") + XCTAssertEqual("Cafe\u{301}".replacing(regex, with: "-"), "-----") + XCTAssertEqual("Café".replacing(emptyRegex, with: "-"), "-C-a-f-é-") + XCTAssertEqual("Cafe\u{301}".replacing(emptyRegex, with: "-"), "-C-a-f-e-\u{301}-") + } + func testSwitches() { switch "abcde" { case try! Regex("a.*f"):