diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Contains.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Contains.swift index e481597f8..9d8e7349d 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Contains.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Contains.swift @@ -74,6 +74,6 @@ extension BidirectionalCollection where SubSequence == Substring { @_disfavoredOverload @available(SwiftStdlib 5.7, *) public func contains(_ regex: some RegexComponent) -> Bool { - _contains(RegexConsumer(regex)) + (try? regex.regex.firstMatch(in: self[...])) != nil } } diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift index 40732255c..fc6b23af2 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift @@ -229,9 +229,18 @@ extension BidirectionalCollection where Element: Comparable { @available(SwiftStdlib 5.7, *) struct RegexRangesCollection { let base: RegexMatchesCollection - - init(string: Substring, regex: Regex) { - self.base = RegexMatchesCollection(base: string, regex: regex) + + init( + input: String, + subjectBounds: Range, + searchBounds: Range, + regex: Regex + ) { + self.base = .init( + input: input, + subjectBounds: subjectBounds, + searchBounds: searchBounds, + regex: regex) } } @@ -263,12 +272,29 @@ extension RegexRangesCollection: Collection { // MARK: Regex algorithms extension Collection where SubSequence == Substring { + @available(SwiftStdlib 5.7, *) + @_disfavoredOverload + func _ranges( + of regex: R, + subjectBounds: Range, + searchBounds: Range + ) -> RegexRangesCollection { + RegexRangesCollection( + input: self[...].base, + subjectBounds: subjectBounds, + searchBounds: searchBounds, + regex: regex.regex) + } + @available(SwiftStdlib 5.7, *) @_disfavoredOverload func _ranges( of regex: R ) -> RegexRangesCollection { - RegexRangesCollection(string: self[...], regex: regex.regex) + _ranges( + of: regex, + subjectBounds: startIndex.. Self where Replacement.Element == Element { _replacing( - self[subrange]._ranges(of: regex), + self._ranges( + of: regex, + subjectBounds: startIndex.. { - let input: Substring + let input: String + let subjectBounds: Range + let searchBounds: Range let regex: Regex let startIndex: Index - init(base: Substring, regex: Regex) { - self.input = base + init( + input: String, + subjectBounds: Range, + searchBounds: Range, + regex: Regex + ) { + self.input = input + self.subjectBounds = subjectBounds + self.searchBounds = searchBounds self.regex = regex - self.startIndex = base.firstMatch(of: regex).map(Index.match) ?? .end + self.startIndex = (try? regex._firstMatch( + input, + subjectBounds: subjectBounds, + searchBounds: searchBounds)).map(Index.match) ?? .end } } @@ -241,12 +253,15 @@ extension RegexMatchesCollection: Sequence { } // `nextStart` is `nil` when iteration has completed - guard let start = nextStart else { + guard let start = nextStart, start <= base.searchBounds.upperBound else { return nil } // Otherwise, find the next match (if any) and compute `nextStart` - let match = try? base.regex.firstMatch(in: base.input[start...]) + let match = try? base.regex._firstMatch( + base.input, + subjectBounds: base.subjectBounds, + searchBounds: start..( of regex: R ) -> RegexMatchesCollection { - RegexMatchesCollection(base: self[...], regex: regex.regex) + RegexMatchesCollection( + input: self[...].base, + subjectBounds: startIndex.., + searchBounds: Range + ) -> Processor { + Processor( + program: program, + input: input, + subjectBounds: subjectBounds, + searchBounds: searchBounds, + matchMode: .partialFromFront, + isTracingEnabled: enableTracing) + } } extension Processor { diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 8471ef861..8fa3716b9 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -28,18 +28,44 @@ struct Processor { typealias Input = String typealias Element = Input.Element + /// The base collection of the subject to search. + /// + /// Taken together, `input` and `subjectBounds` define the actual subject + /// of the search. `input` can be a "supersequence" of the subject, while + /// `input[subjectBounds]` is the logical entity that is being searched. let input: Input + + /// The bounds of the logical subject in `input`. + /// + /// `subjectBounds` represents the bounds of the string or substring that a + /// regex operation is invoked upon. Anchors like `^` and `.startOfSubject` + /// always use `subjectBounds` as their reference points, instead of + /// `input`'s boundaries or `searchBounds`. + /// + /// `subjectBounds` is always equal to or a subrange of + /// `input.startIndex.. + + /// The bounds within the subject for an individual search. + /// + /// `searchBounds` is equal to `subjectBounds` in some cases, but can be a + /// subrange when performing operations like searching for matches iteratively + /// or calling `str.replacing(_:with:subrange:)`. + /// + /// Anchors like `^` and `.startOfSubject` use `subjectBounds` instead of + /// `searchBounds`. The "start of matching" anchor `\G` uses `searchBounds` + /// as its starting point. + let searchBounds: Range + let matchMode: MatchMode let instructions: InstructionList // MARK: Resettable state - - // The subject bounds. - // - // FIXME: This also conflates search bounds too! - var bounds: Range - - // The current position in the subject + + /// The current search position while processing. + /// + /// `currentPosition` must always be in the range `subjectBounds` or equal + /// to `subjectBounds.upperBound`. var currentPosition: Position var controller: Controller @@ -56,53 +82,50 @@ struct Processor { var failureReason: Error? = nil - // MARK: Metrics, debugging, etc. var cycleCount = 0 var isTracingEnabled: Bool - } extension Processor { typealias Position = Input.Index - var start: Position { bounds.lowerBound } - var end: Position { bounds.upperBound } + var start: Position { searchBounds.lowerBound } + var end: Position { searchBounds.upperBound } } extension Processor { init( program: MEProgram, input: Input, - bounds: Range, + subjectBounds: Range, + searchBounds: Range, matchMode: MatchMode, isTracingEnabled: Bool ) { self.controller = Controller(pc: 0) self.instructions = program.instructions self.input = input - self.bounds = bounds + self.subjectBounds = subjectBounds + self.searchBounds = searchBounds self.matchMode = matchMode self.isTracingEnabled = isTracingEnabled - self.currentPosition = bounds.lowerBound + self.currentPosition = searchBounds.lowerBound - self.registers = Registers(program, bounds.upperBound) + // Initialize registers with end of search bounds + self.registers = Registers(program, searchBounds.upperBound) self.storedCaptures = Array( repeating: .init(), count: program.registerInfo.captures) _checkInvariants() } - - mutating func reset(searchBounds: Range) { - // FIXME: We currently conflate both subject bounds and search bounds - // This should just reset search bounds - self.bounds = searchBounds - self.currentPosition = self.bounds.lowerBound + mutating func reset(currentPosition: Position) { + self.currentPosition = currentPosition self.controller = Controller(pc: 0) - self.registers.reset(sentinel: bounds.upperBound) + self.registers.reset(sentinel: searchBounds.upperBound) self.savePoints.removeAll(keepingCapacity: true) self.callStack.removeAll(keepingCapacity: true) @@ -118,10 +141,12 @@ extension Processor { } func _checkInvariants() { - assert(end <= input.endIndex) - assert(start >= input.startIndex) - assert(currentPosition >= start) - assert(currentPosition <= end) + assert(searchBounds.lowerBound >= subjectBounds.lowerBound) + assert(searchBounds.upperBound <= subjectBounds.upperBound) + assert(subjectBounds.lowerBound >= input.startIndex) + assert(subjectBounds.upperBound <= input.endIndex) + assert(currentPosition >= searchBounds.lowerBound) + assert(currentPosition <= searchBounds.upperBound) } } @@ -129,7 +154,7 @@ extension Processor { var slice: Input.SubSequence { // TODO: Should we whole-scale switch to slices, or // does that depend on options for some anchors? - input[bounds] + input[searchBounds] } // Advance in our input, without any checks or failure signalling @@ -158,8 +183,8 @@ extension Processor { /// - Precondition: `bounds.contains(index) || index == bounds.upperBound` /// - Precondition: `index >= currentPosition` mutating func resume(at index: Input.Index) { - assert(index >= bounds.lowerBound) - assert(index <= bounds.upperBound) + assert(index >= searchBounds.lowerBound) + assert(index <= searchBounds.upperBound) assert(index >= currentPosition) currentPosition = index } @@ -230,7 +255,7 @@ extension Processor { switch (currentPosition, matchMode) { // When reaching the end of the match bounds or when we are only doing a // prefix match, transition to accept. - case (bounds.upperBound, _), (_, .partialFromFront): + case (searchBounds.upperBound, _), (_, .partialFromFront): state = .accept // When we are doing a full match but did not reach the end of the match @@ -341,9 +366,9 @@ extension Processor { case .consumeBy: let reg = payload.consumer - guard currentPosition < bounds.upperBound, + guard currentPosition < searchBounds.upperBound, let nextIndex = registers[reg]( - input, currentPosition..( _ input: String, - in inputRange: Range, + subjectBounds: Range, + searchBounds: Range, graphemeSemantic: Bool ) throws -> Regex.Match? { - var cpu = engine.makeProcessor( - input: input, bounds: inputRange, matchMode: .partialFromFront) + var cpu = engine.makeFirstMatchProcessor( + input: input, + subjectBounds: subjectBounds, + searchBounds: searchBounds) - var low = inputRange.lowerBound - let high = inputRange.upperBound + var low = searchBounds.lowerBound + let high = searchBounds.upperBound while true { if let m: Regex.Match = try _match( - input, in: low..( _ input: String, - in inputRange: Range, + in subjectBounds: Range, _ mode: MatchMode ) throws -> Regex.Match? { var cpu = engine.makeProcessor( - input: input, bounds: inputRange, matchMode: mode) - return try _match(input, in: inputRange, using: &cpu) + input: input, bounds: subjectBounds, matchMode: mode) + return try _match(input, from: subjectBounds.lowerBound, using: &cpu) } @available(SwiftStdlib 5.7, *) func _match( _ input: String, - in inputRange: Range, + from currentPosition: String.Index, using cpu: inout Processor ) throws -> Regex.Match? { + // FIXME: currentPosition is already encapsulated in cpu, don't pass in + // FIXME: cpu.consume() should return the matched range, not the upper bound guard let endIdx = cpu.consume() else { if let e = cpu.failureReason { throw e @@ -74,7 +79,7 @@ struct Executor { values: cpu.storedCaptures, referencedCaptureOffsets: engine.program.referencedCaptureOffsets) - let range = inputRange.lowerBound.., + in subjectBounds: Range, _ mode: MatchMode ) throws -> Regex.Match? { - try match(input, in: inputRange, mode) + try match(input, in: subjectBounds, mode) } } diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index 156dd7220..8f9d0e010 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -145,6 +145,10 @@ extension Regex where Output == AnyRegexOutput { public init(_ pattern: String) throws { self.init(ast: try parse(pattern, .semantic, .traditional)) } + + internal init(_ pattern: String, syntax: SyntaxOptions) throws { + self.init(ast: try parse(pattern, .semantic, syntax)) + } } @available(SwiftStdlib 5.7, *) diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index 7e4be5652..de63df036 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -126,21 +126,32 @@ extension Regex { func _match( _ input: String, - in inputRange: Range, + in subjectBounds: Range, mode: MatchMode = .wholeString ) throws -> Regex.Match? { let executor = Executor(program: regex.program.loweredProgram) - return try executor.match(input, in: inputRange, mode) + return try executor.match(input, in: subjectBounds, mode) } func _firstMatch( _ input: String, - in inputRange: Range + in subjectBounds: Range + ) throws -> Regex.Match? { + try _firstMatch(input, subjectBounds: subjectBounds, searchBounds: subjectBounds) + } + + func _firstMatch( + _ input: String, + subjectBounds: Range, + searchBounds: Range ) throws -> Regex.Match? { let executor = Executor(program: regex.program.loweredProgram) let graphemeSemantic = regex.initialOptions.semanticLevel == .graphemeCluster return try executor.firstMatch( - input, in: inputRange, graphemeSemantic: graphemeSemantic) + input, + subjectBounds: subjectBounds, + searchBounds: searchBounds, + graphemeSemantic: graphemeSemantic) } } diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 352c267a5..db2088782 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -577,12 +577,12 @@ extension _CharacterClassModel { ) -> Bool { // FIXME: How should we handle bounds? // We probably need two concepts - if input.isEmpty { return false } - if pos == input.startIndex { + if bounds.isEmpty { return false } + if pos == bounds.lowerBound { return self.matches(in: input, at: pos, with: options) != nil } let priorIdx = input.index(before: pos) - if pos == input.endIndex { + if pos == bounds.upperBound { return self.matches(in: input, at: priorIdx, with: options) != nil } diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index fc31e575f..24d6f219e 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -18,6 +18,7 @@ class RegexDSLTests: XCTestCase { _ tests: (input: String, expectedCaptures: MatchType?)..., matchType: MatchType.Type, _ equivalence: (MatchType, MatchType) -> Bool, + xfail: Bool = false, file: StaticString = #file, line: UInt = #line, @RegexComponentBuilder _ content: () -> Content @@ -25,8 +26,13 @@ class RegexDSLTests: XCTestCase { let regex = content() for (input, maybeExpectedCaptures) in tests { let maybeMatch = input.wholeMatch(of: regex) - if let expectedCaptures = maybeExpectedCaptures { - let match = try XCTUnwrap(maybeMatch, file: file, line: line) + if let expectedCaptures = maybeExpectedCaptures, + let match = maybeMatch + { + if xfail { + XCTFail("Unexpectedly matched", file: file, line: line) + continue + } XCTAssertTrue( type(of: regex).RegexOutput.self == MatchType.self, """ @@ -39,7 +45,9 @@ class RegexDSLTests: XCTestCase { "'\(captures)' is not equal to the expected '\(expectedCaptures)'.", file: file, line: line) } else { - XCTAssertNil(maybeMatch, file: file, line: line) + if !xfail { + XCTAssertNil(maybeMatch, file: file, line: line) + } } } } @@ -525,6 +533,35 @@ class RegexDSLTests: XCTestCase { NegativeLookahead { "2" } CharacterClass.word } + + try _testDSLCaptures( + ("aaa", "aaa"), + ("\naaa", nil), + ("aaa\n", nil), + ("\naaa\n", nil), + matchType: Substring.self, ==) + { + Regex { + Anchor.startOfSubject + Repeat("a", count: 3) + Anchor.endOfSubject + }.anchorsMatchLineEndings() + } + + // FIXME: Anchor.start/endOfLine needs to always match line endings, + // even when the `anchorsMatchLineEndings()` option is turned off. + try _testDSLCaptures( + ("\naaa", "aaa"), + ("aaa\n", "aaa"), + ("\naaa\n", "aaa"), + matchType: Substring.self, ==, xfail: true) + { + Regex { + Anchor.startOfLine + Repeat("a", count: 3) + Anchor.endOfLine + } + } } func testNestedGroups() throws { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 35f8a9548..6bf7986ed 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -20,44 +20,17 @@ struct MatchError: Error { } } -extension Executor { - func _firstMatch( - _ regex: String, input: String, - syntax: SyntaxOptions = .traditional, - enableTracing: Bool = false - ) throws -> (match: Substring, captures: [Substring?]) { - // TODO: This should be a CollectionMatcher API to call... - // Consumer -> searcher algorithm - var start = input.startIndex - while true { - if let result = try! self.dynamicMatch( - input, - in: start.. (String, [String?]) { - var executor = try _compileRegex(regex, syntax) - executor.engine.enableTracing = enableTracing - let (str, caps) = try executor._firstMatch( - regex, input: input, enableTracing: enableTracing) - let capStrs = caps.map { $0 == nil ? nil : String($0!) } - return (String(str), capStrs) + let regex = try Regex(regexStr, syntax: syntax) + guard let result = try regex.firstMatch(in: input) else { + throw MatchError("match not found for \(regexStr) in \(input)") + } + let caps = result.output.slices(from: input) + return (String(input[result.range]), caps.map { $0.map(String.init) }) } // TODO: multiple-capture variant @@ -66,7 +39,6 @@ func flatCaptureTest( _ regex: String, _ tests: (input: String, expect: [String?]?)..., syntax: SyntaxOptions = .traditional, - enableTracing: Bool = false, dumpAST: Bool = false, xfail: Bool = false, file: StaticString = #file, @@ -77,8 +49,7 @@ func flatCaptureTest( guard var (_, caps) = try? _firstMatch( regex, input: test, - syntax: syntax, - enableTracing: enableTracing + syntax: syntax ) else { if expect == nil { continue @@ -162,8 +133,7 @@ func firstMatchTest( let (found, _) = try _firstMatch( regex, input: input, - syntax: syntax, - enableTracing: enableTracing) + syntax: syntax) if xfail { XCTAssertNotEqual(found, match, file: file, line: line) @@ -947,7 +917,7 @@ extension RegexTests { #"\d{3}(?