From 79e4c65f6a8279fdd62a32131745f368791c076c Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 17 Jun 2022 14:53:58 -0500 Subject: [PATCH 01/18] Disentangle disparate 'bounds' ideas in processor This starts the separation of two different ideas for boundaries in the base input: - subjectBounds: These represent the actual subject in the input string. For a `String` callee, this will cover the entire bounds, while for a `Substring` these will represent the bounds of the substring in the base. - searchBounds: These represent the current search range in the subject. These bounds can be the same as `subjectBounds` or a subrange when searching for subsequent matches or replacing only in a subrange of a string. --- Sources/_StringProcessing/ByteCodeGen.swift | 27 +++++---- .../_StringProcessing/Engine/Consume.swift | 17 +++++- .../_StringProcessing/Engine/Processor.swift | 59 +++++++++++++------ Sources/_StringProcessing/Executor.swift | 10 ++-- Sources/_StringProcessing/Regex/Match.swift | 57 +++++++++++++----- .../_CharacterClassModel.swift | 6 +- 6 files changed, 123 insertions(+), 53 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 8407f68ac..81a5183e7 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -97,25 +97,25 @@ fileprivate extension Compiler.ByteCodeGen { switch kind { case .startOfSubject: builder.buildAssert { (input, pos, bounds) in - pos == input.startIndex + pos == bounds.lowerBound } case .endOfSubjectBeforeNewline: builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in - if pos == input.endIndex { return true } + if pos == bounds.upperBound { return true } switch semanticLevel { case .graphemeCluster: - return input.index(after: pos) == input.endIndex + return input.index(after: pos) == bounds.upperBound && input[pos].isNewline case .unicodeScalar: - return input.unicodeScalars.index(after: pos) == input.endIndex + return input.unicodeScalars.index(after: pos) == bounds.upperBound && input.unicodeScalars[pos].isNewline } } case .endOfSubject: builder.buildAssert { (input, pos, bounds) in - pos == input.endIndex + pos == bounds.upperBound } case .resetStartOfMatch: @@ -124,9 +124,10 @@ fileprivate extension Compiler.ByteCodeGen { case .firstMatchingPositionInSubject: // TODO: We can probably build a nice model with API here - builder.buildAssert { (input, pos, bounds) in - pos == bounds.lowerBound - } + + // FIXME: This needs to be based on `searchBounds`, + // not the `subjectBounds` given as an argument here + builder.buildAssert { (input, pos, bounds) in false } case .textSegment: builder.buildAssert { (input, pos, _) in @@ -141,9 +142,10 @@ fileprivate extension Compiler.ByteCodeGen { } case .startOfLine: + // FIXME: Anchor.startOfLine must always use this first branch if options.anchorsMatchNewlines { builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in - if pos == input.startIndex { return true } + if pos == bounds.lowerBound { return true } switch semanticLevel { case .graphemeCluster: return input[input.index(before: pos)].isNewline @@ -153,14 +155,15 @@ fileprivate extension Compiler.ByteCodeGen { } } else { builder.buildAssert { (input, pos, bounds) in - pos == input.startIndex + pos == bounds.lowerBound } } case .endOfLine: + // FIXME: Anchor.endOfLine must always use this first branch if options.anchorsMatchNewlines { builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in - if pos == input.endIndex { return true } + if pos == bounds.upperBound { return true } switch semanticLevel { case .graphemeCluster: return input[pos].isNewline @@ -170,7 +173,7 @@ fileprivate extension Compiler.ByteCodeGen { } } else { builder.buildAssert { (input, pos, bounds) in - pos == input.endIndex + pos == bounds.upperBound } } diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift index bc60ba260..540e1bdc5 100644 --- a/Sources/_StringProcessing/Engine/Consume.swift +++ b/Sources/_StringProcessing/Engine/Consume.swift @@ -18,10 +18,25 @@ extension Engine { Processor( program: program, input: input, - bounds: bounds, + subjectBounds: bounds, + searchBounds: bounds, matchMode: matchMode, isTracingEnabled: enableTracing) } + + func makeFirstMatchProcessor( + input: Input, + subjectBounds: Range, + searchBounds: Range + ) -> Processor { + Processor( + program: program, + input: input, + subjectBounds: subjectBounds, + searchBounds: searchBounds, + matchMode: .partialFromFront, + isTracingEnabled: enableTracing) + } } extension Processor where Input == String { diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index da8bfea14..573a4db09 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -31,11 +31,33 @@ struct Processor< > where Input.Element: Equatable { // maybe Hashable? typealias Element = Input.Element + /// The base collection of the subject to search. + /// + /// Taken together, `input` and `subjectBounds` define the actual subject + /// of the search. `input` can be a "supersequence" of the subject, while + /// `input[subjectBounds]` is the logical entity that is being searched. let input: Input - let bounds: Range - let matchMode: MatchMode + + /// The bounds of the logical subject in `input`. + /// + /// + /// `subjectBounds` is equal to or a subrange of + /// `input.startIndex.. + + /// The bounds within the subject for an individual search. + /// + /// `searchBounds` is equal to `subjectBounds` in most cases, but can be a + /// subrange when performing operations like `str.replacing(_:with:subrange:)`. + let searchBounds: Range + + /// The current search position while processing. + /// + /// `currentPosition` must always be in the range `subjectBounds` or equal + /// to `subjectBounds.upperBound`. var currentPosition: Position - + + let matchMode: MatchMode let instructions: InstructionList var controller: Controller @@ -61,27 +83,30 @@ struct Processor< extension Processor { typealias Position = Input.Index - var start: Position { bounds.lowerBound } - var end: Position { bounds.upperBound } + var start: Position { subjectBounds.lowerBound } + var end: Position { subjectBounds.upperBound } } extension Processor { init( program: MEProgram, input: Input, - bounds: Range, + subjectBounds: Range, + searchBounds: Range, matchMode: MatchMode, isTracingEnabled: Bool ) { self.controller = Controller(pc: 0) self.instructions = program.instructions self.input = input - self.bounds = bounds + self.subjectBounds = subjectBounds + self.searchBounds = searchBounds self.matchMode = matchMode self.isTracingEnabled = isTracingEnabled - self.currentPosition = bounds.lowerBound + self.currentPosition = searchBounds.lowerBound - self.registers = Registers(program, bounds.upperBound) + // Initialize registers with end of search bounds + self.registers = Registers(program, searchBounds.upperBound) self.storedCaptures = Array( repeating: .init(), count: program.registerInfo.captures) @@ -100,7 +125,7 @@ extension Processor { var slice: Input.SubSequence { // TODO: Should we whole-scale switch to slices, or // does that depend on options for some anchors? - input[bounds] + input[subjectBounds] } // Advance in our input @@ -123,8 +148,8 @@ extension Processor { /// - Precondition: `bounds.contains(index) || index == bounds.upperBound` /// - Precondition: `index >= currentPosition` mutating func resume(at index: Input.Index) { - assert(index >= bounds.lowerBound) - assert(index <= bounds.upperBound) + assert(index >= subjectBounds.lowerBound) + assert(index <= subjectBounds.upperBound) assert(index >= currentPosition) currentPosition = index } @@ -199,7 +224,7 @@ extension Processor { switch (currentPosition, matchMode) { // When reaching the end of the match bounds or when we are only doing a // prefix match, transition to accept. - case (bounds.upperBound, _), (_, .partialFromFront): + case (subjectBounds.upperBound, _), (_, .partialFromFront): state = .accept // When we are doing a full match but did not reach the end of the match @@ -371,9 +396,9 @@ extension Processor { case .consumeBy: let reg = payload.consumer - guard currentPosition < bounds.upperBound, + guard currentPosition < subjectBounds.upperBound, let nextIndex = registers[reg]( - input, currentPosition..( _ input: String, - in inputRange: Range, + in subjectBounds: Range, _ mode: MatchMode ) throws -> Regex.Match? { var cpu = engine.makeProcessor( - input: input, bounds: inputRange, matchMode: mode) + input: input, bounds: subjectBounds, matchMode: mode) guard let endIdx = cpu.consume() else { if let e = cpu.failureReason { @@ -39,7 +39,7 @@ struct Executor { values: cpu.storedCaptures, referencedCaptureOffsets: engine.program.referencedCaptureOffsets) - let range = inputRange.lowerBound.., + in subjectBounds: Range, _ mode: MatchMode ) throws -> Regex.Match? { - try match(input, in: inputRange, mode) + try match(input, in: subjectBounds, mode) } } diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index 831c758dd..d423059ce 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -126,32 +126,59 @@ extension Regex { func _match( _ input: String, - in inputRange: Range, + in subjectBounds: Range, mode: MatchMode = .wholeString ) throws -> Regex.Match? { let executor = Executor(program: regex.program.loweredProgram) - return try executor.match(input, in: inputRange, mode) + return try executor.match(input, in: subjectBounds, mode) } func _firstMatch( _ input: String, - in inputRange: Range + in subjectBounds: Range ) throws -> Regex.Match? { - // FIXME: Something more efficient, likely an engine interface, and we - // should scrap the RegexConsumer crap and call this + try _firstMatch(input, subjectBounds: subjectBounds, searchBounds: subjectBounds) + } + + func _firstMatch( + _ input: String, + subjectBounds: Range, + searchBounds: Range + ) throws -> Regex.Match? { + let executor = Executor(program: regex.program.loweredProgram) - var low = inputRange.lowerBound - let high = inputRange.upperBound + var low = searchBounds.lowerBound + let high = searchBounds.upperBound while true { - if let m = try _match(input, in: low..= high { return nil } - if regex.initialOptions.semanticLevel == .graphemeCluster { - input.formIndex(after: &low) - } else { - input.unicodeScalars.formIndex(after: &low) + // FIXME: Make once and reset at this point (or after search) + var cpu = executor.engine.makeFirstMatchProcessor( + input: input, subjectBounds: subjectBounds, searchBounds: searchBounds) + cpu.currentPosition = low + + guard let endIdx = cpu.consume() else { + if let e = cpu.failureReason { + throw e + } + + if regex.initialOptions.semanticLevel == .graphemeCluster { + input.formIndex(after: &low) + } else { + input.unicodeScalars.formIndex(after: &low) + } + if low >= high { return nil } + + continue } + + let capList = MECaptureList( + values: cpu.storedCaptures, + referencedCaptureOffsets: executor.engine.program.referencedCaptureOffsets) + + let range = searchBounds.lowerBound.. Bool { // FIXME: How should we handle bounds? // We probably need two concepts - if input.isEmpty { return false } - if pos == input.startIndex { + if bounds.isEmpty { return false } + if pos == bounds.lowerBound { return self.matches(in: input, at: pos, with: options) != nil } let priorIdx = input.index(before: pos) - if pos == input.endIndex { + if pos == bounds.upperBound { return self.matches(in: input, at: priorIdx, with: options) != nil } From 8b585aff6fd77dcfc37ca7c8a66298c9ea6c546e Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 17 Jun 2022 15:59:49 -0500 Subject: [PATCH 02/18] Stop testing through the Executor We should be testing the higher-level Regex.firstMatch function instead of the lower-level types directly. --- Tests/RegexTests/MatchTests.swift | 50 +++++++------------------------ 1 file changed, 10 insertions(+), 40 deletions(-) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 35f8a9548..c5a9e19d2 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -20,44 +20,17 @@ struct MatchError: Error { } } -extension Executor { - func _firstMatch( - _ regex: String, input: String, - syntax: SyntaxOptions = .traditional, - enableTracing: Bool = false - ) throws -> (match: Substring, captures: [Substring?]) { - // TODO: This should be a CollectionMatcher API to call... - // Consumer -> searcher algorithm - var start = input.startIndex - while true { - if let result = try! self.dynamicMatch( - input, - in: start.. (String, [String?]) { - var executor = try _compileRegex(regex, syntax) - executor.engine.enableTracing = enableTracing - let (str, caps) = try executor._firstMatch( - regex, input: input, enableTracing: enableTracing) - let capStrs = caps.map { $0 == nil ? nil : String($0!) } - return (String(str), capStrs) + let regex = try Regex(regexStr) + guard let result = try regex.firstMatch(in: input) else { + throw MatchError("match not found for \(regexStr) in \(input)") + } + let caps = result.output.slices(from: input) + return (String(input[result.range]), caps.map { $0.map(String.init) }) } // TODO: multiple-capture variant @@ -66,7 +39,6 @@ func flatCaptureTest( _ regex: String, _ tests: (input: String, expect: [String?]?)..., syntax: SyntaxOptions = .traditional, - enableTracing: Bool = false, dumpAST: Bool = false, xfail: Bool = false, file: StaticString = #file, @@ -77,8 +49,7 @@ func flatCaptureTest( guard var (_, caps) = try? _firstMatch( regex, input: test, - syntax: syntax, - enableTracing: enableTracing + syntax: syntax ) else { if expect == nil { continue @@ -162,8 +133,7 @@ func firstMatchTest( let (found, _) = try _firstMatch( regex, input: input, - syntax: syntax, - enableTracing: enableTracing) + syntax: syntax) if xfail { XCTAssertNotEqual(found, match, file: file, line: line) From a6ff9a5ac49b455a50cd671cb2a8b9f4ee6e8376 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 17 Jun 2022 16:00:23 -0500 Subject: [PATCH 03/18] Fix firstMatch implementation --- Sources/_StringProcessing/Regex/Match.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index d423059ce..8a9a83b10 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -160,12 +160,12 @@ extension Regex { throw e } + if low >= high { return nil } if regex.initialOptions.semanticLevel == .graphemeCluster { input.formIndex(after: &low) } else { input.unicodeScalars.formIndex(after: &low) } - if low >= high { return nil } continue } @@ -174,7 +174,7 @@ extension Regex { values: cpu.storedCaptures, referencedCaptureOffsets: executor.engine.program.referencedCaptureOffsets) - let range = searchBounds.lowerBound.. Date: Fri, 17 Jun 2022 16:00:38 -0500 Subject: [PATCH 04/18] Stop using RegexConsumer in contains(_:) --- Sources/_StringProcessing/Algorithms/Algorithms/Contains.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Contains.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Contains.swift index e481597f8..9d8e7349d 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Contains.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Contains.swift @@ -74,6 +74,6 @@ extension BidirectionalCollection where SubSequence == Substring { @_disfavoredOverload @available(SwiftStdlib 5.7, *) public func contains(_ regex: some RegexComponent) -> Bool { - _contains(RegexConsumer(regex)) + (try? regex.regex.firstMatch(in: self[...])) != nil } } From d1290f82d82bc789d37bd7e2eebd696eb2c8e0ed Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 17 Jun 2022 16:07:48 -0500 Subject: [PATCH 05/18] Update test xfailures str.replacing(_:with:subrange:) still needs to use separate bounds for the subject and the search range. --- Tests/RegexTests/MatchTests.swift | 46 ++++++++++++++----------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index c5a9e19d2..5fbdd949b 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -700,11 +700,11 @@ extension RegexTests { firstMatchTest(#"[\Qa-c\E]+"#, input: "a-c", match: "a-c") firstMatchTest(#"["a-c"]+"#, input: "abc", match: "a", - syntax: .experimental) + syntax: .experimental, xfail: true) firstMatchTest(#"["abc"]+"#, input: "cba", match: "cba", syntax: .experimental) firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: "abc", - syntax: .experimental) + syntax: .experimental, xfail: true) firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: #""abc""#) } @@ -1460,35 +1460,31 @@ extension RegexTests { let postfixLetters = try Regex(#"[a-z]+$"#, as: Substring.self) // start anchor (^) should match beginning of substring - XCTExpectFailure { - XCTAssertEqual(trimmed.firstMatch(of: prefixLetters)?.output, "abc") - } - XCTExpectFailure { - XCTAssertEqual(trimmed.replacing(prefixLetters, with: ""), "456def") - } + XCTAssertEqual(trimmed.firstMatch(of: prefixLetters)?.output, "abc") + XCTAssertEqual(trimmed.replacing(prefixLetters, with: ""), "456def") // end anchor ($) should match end of substring + XCTAssertEqual(trimmed.firstMatch(of: postfixLetters)?.output, "def") + XCTAssertEqual(trimmed.replacing(postfixLetters, with: ""), "abc456") + + // start anchor (^) should _not_ match beginning of subrange XCTExpectFailure { - XCTAssertEqual(trimmed.firstMatch(of: postfixLetters)?.output, "def") + XCTAssertEqual( + string.replacing( + prefixLetters, + with: "", + subrange: trimmed.startIndex.. Date: Mon, 20 Jun 2022 14:01:38 -0500 Subject: [PATCH 06/18] firstMatch shouldn't update searchBounds on iteration When we move forward while searching for the first match, the search bounds should stay the same. Only the currentPosition needs to move forward. This will allow us to implement the \G start of match anchor, with which /\Gab/ matches "abab" twice, compared with /^ab/, which only matches once. --- .../_StringProcessing/Engine/Processor.swift | 19 +++++++++---------- Sources/_StringProcessing/Executor.swift | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index dee77c30b..2fcd7de27 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -45,17 +45,17 @@ struct Processor< /// `input.startIndex.. - let matchMode: MatchMode - let instructions: InstructionList - - // MARK: Resettable state - /// The bounds within the subject for an individual search. /// /// `searchBounds` is equal to `subjectBounds` in some cases, but can be a /// subrange when performing operations like searching for matches iteratively /// or calling `str.replacing(_:with:subrange:)`. - var searchBounds: Range + let searchBounds: Range + + let matchMode: MatchMode + let instructions: InstructionList + + // MARK: Resettable state /// The current search position while processing. /// @@ -115,13 +115,12 @@ extension Processor { _checkInvariants() } - mutating func reset(searchBounds: Range) { - self.searchBounds = searchBounds - self.currentPosition = self.searchBounds.lowerBound + mutating func reset(start: Position, end: Position) { + self.currentPosition = start self.controller = Controller(pc: 0) - self.registers.reset(sentinel: searchBounds.upperBound) + self.registers.reset(sentinel: end) self.savePoints.removeAll(keepingCapacity: true) self.callStack.removeAll(keepingCapacity: true) diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 432fef167..13ad5733c 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -45,7 +45,7 @@ struct Executor { } else { input.unicodeScalars.formIndex(after: &low) } - cpu.reset(searchBounds: low.. Date: Mon, 20 Jun 2022 15:04:28 -0500 Subject: [PATCH 07/18] Make matches(of:) and ranges(of:) boundary-aware With this change, RegexMatchesCollection keeps the subject bounds and search bounds separately, modifying the search bounds with each iteration. In addition, the replace methods that only operate on a subrange can specify that specifically, getting the correct anchor behavior while only matching within a portion of a string. --- .../Algorithms/Algorithms/Ranges.swift | 34 ++++++++++++-- .../Algorithms/Algorithms/Replace.swift | 5 +- .../Algorithms/Matching/Matches.swift | 39 ++++++++++++---- Tests/RegexTests/MatchTests.swift | 46 +++++++++++-------- 4 files changed, 93 insertions(+), 31 deletions(-) diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift index 40732255c..fc6b23af2 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/Ranges.swift @@ -229,9 +229,18 @@ extension BidirectionalCollection where Element: Comparable { @available(SwiftStdlib 5.7, *) struct RegexRangesCollection { let base: RegexMatchesCollection - - init(string: Substring, regex: Regex) { - self.base = RegexMatchesCollection(base: string, regex: regex) + + init( + input: String, + subjectBounds: Range, + searchBounds: Range, + regex: Regex + ) { + self.base = .init( + input: input, + subjectBounds: subjectBounds, + searchBounds: searchBounds, + regex: regex) } } @@ -263,12 +272,29 @@ extension RegexRangesCollection: Collection { // MARK: Regex algorithms extension Collection where SubSequence == Substring { + @available(SwiftStdlib 5.7, *) + @_disfavoredOverload + func _ranges( + of regex: R, + subjectBounds: Range, + searchBounds: Range + ) -> RegexRangesCollection { + RegexRangesCollection( + input: self[...].base, + subjectBounds: subjectBounds, + searchBounds: searchBounds, + regex: regex.regex) + } + @available(SwiftStdlib 5.7, *) @_disfavoredOverload func _ranges( of regex: R ) -> RegexRangesCollection { - RegexRangesCollection(string: self[...], regex: regex.regex) + _ranges( + of: regex, + subjectBounds: startIndex.. Self where Replacement.Element == Element { _replacing( - self[subrange]._ranges(of: regex), + self._ranges( + of: regex, + subjectBounds: startIndex.. { - let input: Substring + let input: String + let subjectBounds: Range + let searchBounds: Range let regex: Regex let startIndex: Index - init(base: Substring, regex: Regex) { - self.input = base + init( + input: String, + subjectBounds: Range, + searchBounds: Range, + regex: Regex + ) { + self.input = input + self.subjectBounds = subjectBounds + self.searchBounds = searchBounds self.regex = regex - self.startIndex = base.firstMatch(of: regex).map(Index.match) ?? .end + self.startIndex = (try? regex._firstMatch( + input, + subjectBounds: subjectBounds, + searchBounds: searchBounds)).map(Index.match) ?? .end } } @@ -241,12 +253,15 @@ extension RegexMatchesCollection: Sequence { } // `nextStart` is `nil` when iteration has completed - guard let start = nextStart else { + guard let start = nextStart, start <= base.searchBounds.upperBound else { return nil } // Otherwise, find the next match (if any) and compute `nextStart` - let match = try? base.regex.firstMatch(in: base.input[start...]) + let match = try? base.regex._firstMatch( + base.input, + subjectBounds: base.subjectBounds, + searchBounds: start..( of regex: R ) -> RegexMatchesCollection { - RegexMatchesCollection(base: self[...], regex: regex.regex) + RegexMatchesCollection( + input: self[...].base, + subjectBounds: startIndex.. Date: Tue, 21 Jun 2022 11:03:42 -0500 Subject: [PATCH 08/18] Improve subject/searchBounds documentation --- Sources/_StringProcessing/Engine/Processor.swift | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 2fcd7de27..2994d37e6 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -40,8 +40,12 @@ struct Processor< /// The bounds of the logical subject in `input`. /// + /// `subjectBounds` represents the bounds of the string or substring that a + /// regex operation is invoked upon. Anchors like `^` and `.startOfSubject` + /// always use `subjectBounds` as their reference points, instead of + /// `input`'s boundaries or `searchBounds`. /// - /// `subjectBounds` is equal to or a subrange of + /// `subjectBounds` is always equal to or a subrange of /// `input.startIndex.. @@ -50,6 +54,10 @@ struct Processor< /// `searchBounds` is equal to `subjectBounds` in some cases, but can be a /// subrange when performing operations like searching for matches iteratively /// or calling `str.replacing(_:with:subrange:)`. + /// + /// Anchors like `^` and `.startOfSubject` use `subjectBounds` instead of + /// `searchBounds`. The "start of matching" anchor `\G` uses `searchBounds` + /// as its starting point. let searchBounds: Range let matchMode: MatchMode From 6dcb9fd621dc592af5430b03ab50e08ea39f0dd2 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 21 Jun 2022 11:04:17 -0500 Subject: [PATCH 09/18] Processor.reset only needs the starting position --- Sources/_StringProcessing/Engine/Processor.swift | 6 +++--- Sources/_StringProcessing/Executor.swift | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 2994d37e6..afa75ddf9 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -123,12 +123,12 @@ extension Processor { _checkInvariants() } - mutating func reset(start: Position, end: Position) { - self.currentPosition = start + mutating func reset(currentPosition: Position) { + self.currentPosition = currentPosition self.controller = Controller(pc: 0) - self.registers.reset(sentinel: end) + self.registers.reset(sentinel: searchBounds.upperBound) self.savePoints.removeAll(keepingCapacity: true) self.callStack.removeAll(keepingCapacity: true) diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 13ad5733c..e4399d2be 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -45,7 +45,7 @@ struct Executor { } else { input.unicodeScalars.formIndex(after: &low) } - cpu.reset(start: low, end: high) + cpu.reset(currentPosition: low) } } From 45c5bd2e3fe7002db9c36189db18dd3993ff7862 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 21 Jun 2022 11:06:07 -0500 Subject: [PATCH 10/18] Change Executor._match to use starting position This still isn't quite right - Processor.consume() should really return the matched range, since it already tracks the starting point of matching. Having it return the whole range will also prepare us for supporting `\K`, which will mean `currentPosition` isn't always the start of the matched range. --- Sources/_StringProcessing/Executor.swift | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index e4399d2be..268854f92 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -35,7 +35,7 @@ struct Executor { let high = searchBounds.upperBound while true { if let m: Regex.Match = try _match( - input, in: low.. Regex.Match? { var cpu = engine.makeProcessor( input: input, bounds: subjectBounds, matchMode: mode) - return try _match(input, in: subjectBounds, using: &cpu) + return try _match(input, from: subjectBounds.lowerBound, using: &cpu) } @available(SwiftStdlib 5.7, *) func _match( _ input: String, - in subjectBounds: Range, + from currentPosition: String.Index, using cpu: inout Processor ) throws -> Regex.Match? { + // FIXME: currentPosition is already encapsulated in cpu, don't pass in + // FIXME: cpu.consume() should return the matched range, not the upper bound guard let endIdx = cpu.consume() else { if let e = cpu.failureReason { throw e @@ -77,7 +79,7 @@ struct Executor { values: cpu.storedCaptures, referencedCaptureOffsets: engine.program.referencedCaptureOffsets) - let range = subjectBounds.lowerBound.. Date: Tue, 21 Jun 2022 11:06:22 -0500 Subject: [PATCH 11/18] Audit search/subjectBounds usage --- Sources/_StringProcessing/Engine/Processor.swift | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index afa75ddf9..16f8d5085 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -184,8 +184,8 @@ extension Processor { /// - Precondition: `bounds.contains(index) || index == bounds.upperBound` /// - Precondition: `index >= currentPosition` mutating func resume(at index: Input.Index) { - assert(index >= subjectBounds.lowerBound) - assert(index <= subjectBounds.upperBound) + assert(index >= searchBounds.lowerBound) + assert(index <= searchBounds.upperBound) assert(index >= currentPosition) currentPosition = index } @@ -256,7 +256,7 @@ extension Processor { switch (currentPosition, matchMode) { // When reaching the end of the match bounds or when we are only doing a // prefix match, transition to accept. - case (subjectBounds.upperBound, _), (_, .partialFromFront): + case (searchBounds.upperBound, _), (_, .partialFromFront): state = .accept // When we are doing a full match but did not reach the end of the match @@ -434,9 +434,9 @@ extension Processor { case .consumeBy: let reg = payload.consumer - guard currentPosition < subjectBounds.upperBound, + guard currentPosition < searchBounds.upperBound, let nextIndex = registers[reg]( - input, currentPosition.. Date: Tue, 21 Jun 2022 11:13:57 -0500 Subject: [PATCH 12/18] Further subject/searchBounds auditing --- Sources/_StringProcessing/Engine/Processor.swift | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 16f8d5085..618f9a47e 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -93,8 +93,8 @@ struct Processor< extension Processor { typealias Position = Input.Index - var start: Position { subjectBounds.lowerBound } - var end: Position { subjectBounds.upperBound } + var start: Position { searchBounds.lowerBound } + var end: Position { searchBounds.upperBound } } extension Processor { @@ -144,10 +144,12 @@ extension Processor { } func _checkInvariants() { - assert(end <= input.endIndex) - assert(start >= input.startIndex) - assert(currentPosition >= start) - assert(currentPosition <= end) + assert(searchBounds.lowerBound >= subjectBounds.lowerBound) + assert(searchBounds.upperBound <= subjectBounds.upperBound) + assert(subjectBounds.lowerBound >= input.startIndex) + assert(subjectBounds.upperBound <= input.endIndex) + assert(currentPosition >= searchBounds.lowerBound) + assert(currentPosition <= searchBounds.upperBound) } } From 7bf5b147b6be5f7396e7933b8ad5d28e3a54b969 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 21 Jun 2022 12:30:34 -0500 Subject: [PATCH 13/18] Update buildAssert use-sites to use subjectBounds This at least clarifies which boundaries are being used. --- Sources/_StringProcessing/ByteCodeGen.swift | 42 ++++++++++----------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 95d34f568..f012b96bd 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -96,26 +96,26 @@ fileprivate extension Compiler.ByteCodeGen { // need to supply both a slice bounds and a per-search bounds. switch kind { case .startOfSubject: - builder.buildAssert { (input, pos, bounds) in - pos == bounds.lowerBound + builder.buildAssert { (input, pos, subjectBounds) in + pos == subjectBounds.lowerBound } case .endOfSubjectBeforeNewline: - builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in - if pos == bounds.upperBound { return true } + builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in + if pos == subjectBounds.upperBound { return true } switch semanticLevel { case .graphemeCluster: - return input.index(after: pos) == bounds.upperBound + return input.index(after: pos) == subjectBounds.upperBound && input[pos].isNewline case .unicodeScalar: - return input.unicodeScalars.index(after: pos) == bounds.upperBound + return input.unicodeScalars.index(after: pos) == subjectBounds.upperBound && input.unicodeScalars[pos].isNewline } } case .endOfSubject: - builder.buildAssert { (input, pos, bounds) in - pos == bounds.upperBound + builder.buildAssert { (input, pos, subjectBounds) in + pos == subjectBounds.upperBound } case .resetStartOfMatch: @@ -127,7 +127,7 @@ fileprivate extension Compiler.ByteCodeGen { // FIXME: This needs to be based on `searchBounds`, // not the `subjectBounds` given as an argument here - builder.buildAssert { (input, pos, bounds) in false } + builder.buildAssert { (input, pos, subjectBounds) in false } case .textSegment: builder.buildAssert { (input, pos, _) in @@ -144,8 +144,8 @@ fileprivate extension Compiler.ByteCodeGen { case .startOfLine: // FIXME: Anchor.startOfLine must always use this first branch if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in - if pos == bounds.lowerBound { return true } + builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in + if pos == subjectBounds.lowerBound { return true } switch semanticLevel { case .graphemeCluster: return input[input.index(before: pos)].isNewline @@ -154,16 +154,16 @@ fileprivate extension Compiler.ByteCodeGen { } } } else { - builder.buildAssert { (input, pos, bounds) in - pos == bounds.lowerBound + builder.buildAssert { (input, pos, subjectBounds) in + pos == subjectBounds.lowerBound } } case .endOfLine: // FIXME: Anchor.endOfLine must always use this first branch if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, bounds) in - if pos == bounds.upperBound { return true } + builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in + if pos == subjectBounds.upperBound { return true } switch semanticLevel { case .graphemeCluster: return input[pos].isNewline @@ -172,25 +172,25 @@ fileprivate extension Compiler.ByteCodeGen { } } } else { - builder.buildAssert { (input, pos, bounds) in - pos == bounds.upperBound + builder.buildAssert { (input, pos, subjectBounds) in + pos == subjectBounds.upperBound } } case .wordBoundary: // TODO: May want to consider Unicode level - builder.buildAssert { [options] (input, pos, bounds) in + builder.buildAssert { [options] (input, pos, subjectBounds) in // TODO: How should we handle bounds? _CharacterClassModel.word.isBoundary( - input, at: pos, bounds: bounds, with: options) + input, at: pos, bounds: subjectBounds, with: options) } case .notWordBoundary: // TODO: May want to consider Unicode level - builder.buildAssert { [options] (input, pos, bounds) in + builder.buildAssert { [options] (input, pos, subjectBounds) in // TODO: How should we handle bounds? !_CharacterClassModel.word.isBoundary( - input, at: pos, bounds: bounds, with: options) + input, at: pos, bounds: subjectBounds, with: options) } } } From 6ce6108c9c7468644d87ea1ff0e04cf574da2cb6 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 21 Jun 2022 12:31:01 -0500 Subject: [PATCH 14/18] Additonal subject/searchBounds correctness. --- Sources/_StringProcessing/Engine/Processor.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 618f9a47e..46e82e11e 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -157,7 +157,7 @@ extension Processor { var slice: Input.SubSequence { // TODO: Should we whole-scale switch to slices, or // does that depend on options for some anchors? - input[subjectBounds] + input[searchBounds] } // Advance in our input, without any checks or failure signalling @@ -465,7 +465,7 @@ extension Processor { let matcher = registers[matcherReg] do { guard let (nextIdx, val) = try matcher( - input, currentPosition, subjectBounds + input, currentPosition, searchBounds ) else { signalFailure() return From 73ba7c7371b3a3056b62fa4eb3dbf99711f05777 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 21 Jun 2022 22:33:24 -0500 Subject: [PATCH 15/18] Add internal API for testing alternative syntaxes --- Sources/_StringProcessing/Regex/AnyRegexOutput.swift | 4 ++++ Tests/RegexTests/MatchTests.swift | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index 156dd7220..8f9d0e010 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -145,6 +145,10 @@ extension Regex where Output == AnyRegexOutput { public init(_ pattern: String) throws { self.init(ast: try parse(pattern, .semantic, .traditional)) } + + internal init(_ pattern: String, syntax: SyntaxOptions) throws { + self.init(ast: try parse(pattern, .semantic, syntax)) + } } @available(SwiftStdlib 5.7, *) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 67c1346b4..bf2cafb1d 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -25,7 +25,7 @@ func _firstMatch( input: String, syntax: SyntaxOptions = .traditional ) throws -> (String, [String?]) { - let regex = try Regex(regexStr) + let regex = try Regex(regexStr, syntax: syntax) guard let result = try regex.firstMatch(in: input) else { throw MatchError("match not found for \(regexStr) in \(input)") } @@ -700,11 +700,11 @@ extension RegexTests { firstMatchTest(#"[\Qa-c\E]+"#, input: "a-c", match: "a-c") firstMatchTest(#"["a-c"]+"#, input: "abc", match: "a", - syntax: .experimental, xfail: true) + syntax: .experimental) firstMatchTest(#"["abc"]+"#, input: "cba", match: "cba", syntax: .experimental) firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: "abc", - syntax: .experimental, xfail: true) + syntax: .experimental) firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: #""abc""#) } From c1a6edf6015abf151d27de3e36b61fb707a32f89 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 22 Jun 2022 15:44:36 -0500 Subject: [PATCH 16/18] Add an XFAIL'd test for \G --- Tests/RegexTests/MatchTests.swift | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index bf2cafb1d..6bf7986ed 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -917,7 +917,7 @@ extension RegexTests { #"\d{3}(? Date: Wed, 22 Jun 2022 15:45:17 -0500 Subject: [PATCH 17/18] Add XFAIL'd tests for Anchor.start/endOfLine These anchors should always match at start/end of line, not just when .anchorsMatchLineEndings() is turned on. --- Tests/RegexBuilderTests/RegexDSLTests.swift | 43 +++++++++++++++++++-- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index fc31e575f..24d6f219e 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -18,6 +18,7 @@ class RegexDSLTests: XCTestCase { _ tests: (input: String, expectedCaptures: MatchType?)..., matchType: MatchType.Type, _ equivalence: (MatchType, MatchType) -> Bool, + xfail: Bool = false, file: StaticString = #file, line: UInt = #line, @RegexComponentBuilder _ content: () -> Content @@ -25,8 +26,13 @@ class RegexDSLTests: XCTestCase { let regex = content() for (input, maybeExpectedCaptures) in tests { let maybeMatch = input.wholeMatch(of: regex) - if let expectedCaptures = maybeExpectedCaptures { - let match = try XCTUnwrap(maybeMatch, file: file, line: line) + if let expectedCaptures = maybeExpectedCaptures, + let match = maybeMatch + { + if xfail { + XCTFail("Unexpectedly matched", file: file, line: line) + continue + } XCTAssertTrue( type(of: regex).RegexOutput.self == MatchType.self, """ @@ -39,7 +45,9 @@ class RegexDSLTests: XCTestCase { "'\(captures)' is not equal to the expected '\(expectedCaptures)'.", file: file, line: line) } else { - XCTAssertNil(maybeMatch, file: file, line: line) + if !xfail { + XCTAssertNil(maybeMatch, file: file, line: line) + } } } } @@ -525,6 +533,35 @@ class RegexDSLTests: XCTestCase { NegativeLookahead { "2" } CharacterClass.word } + + try _testDSLCaptures( + ("aaa", "aaa"), + ("\naaa", nil), + ("aaa\n", nil), + ("\naaa\n", nil), + matchType: Substring.self, ==) + { + Regex { + Anchor.startOfSubject + Repeat("a", count: 3) + Anchor.endOfSubject + }.anchorsMatchLineEndings() + } + + // FIXME: Anchor.start/endOfLine needs to always match line endings, + // even when the `anchorsMatchLineEndings()` option is turned off. + try _testDSLCaptures( + ("\naaa", "aaa"), + ("aaa\n", "aaa"), + ("\naaa\n", "aaa"), + matchType: Substring.self, ==, xfail: true) + { + Regex { + Anchor.startOfLine + Repeat("a", count: 3) + Anchor.endOfLine + } + } } func testNestedGroups() throws { From 94612b2c92d68ea7c0f3815c1c24e2ce9b1ff7cb Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Wed, 22 Jun 2022 15:48:43 -0500 Subject: [PATCH 18/18] Improve FIXME comments --- Sources/_StringProcessing/ByteCodeGen.swift | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index ccdc10abf..9b9f02ff7 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -143,6 +143,9 @@ fileprivate extension Compiler.ByteCodeGen { case .startOfLine: // FIXME: Anchor.startOfLine must always use this first branch + // The behavior of `^` should depend on `anchorsMatchNewlines`, but + // the DSL-based `.startOfLine` anchor should always match the start + // of a line. Right now we don't distinguish between those anchors. if options.anchorsMatchNewlines { builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in if pos == subjectBounds.lowerBound { return true } @@ -161,6 +164,9 @@ fileprivate extension Compiler.ByteCodeGen { case .endOfLine: // FIXME: Anchor.endOfLine must always use this first branch + // The behavior of `$` should depend on `anchorsMatchNewlines`, but + // the DSL-based `.endOfLine` anchor should always match the end + // of a line. Right now we don't distinguish between those anchors. if options.anchorsMatchNewlines { builder.buildAssert { [semanticLevel = options.semanticLevel] (input, pos, subjectBounds) in if pos == subjectBounds.upperBound { return true }