From e0a73abeb7cf93f240805fecdbebee94c97754e1 Mon Sep 17 00:00:00 2001 From: Jacob Hearst Date: Thu, 26 Dec 2024 17:54:31 -0600 Subject: [PATCH 1/8] Reapply changes on top of main --- Sources/RegexBuilder/Anchor.swift | 61 ++++ .../Regex/AST/MatchingOptions.swift | 3 + Sources/_RegexParser/Regex/Parse/Parse.swift | 13 + Sources/_RegexParser/Regex/Parse/Sema.swift | 8 +- Sources/_StringProcessing/ByteCodeGen.swift | 181 +++++++---- .../Engine/Instruction.swift | 57 +++- .../_StringProcessing/Engine/MEBuilder.swift | 89 ++++-- .../_StringProcessing/Engine/MEBuiltins.swift | 267 +++++++++++++++- .../Engine/MEReverseQuantify.swift | 177 +++++++++++ .../_StringProcessing/Engine/Processor.swift | 297 +++++++++++++++++- .../_StringProcessing/Engine/Tracing.swift | 29 ++ .../_StringProcessing/LiteralPrinter.swift | 3 + .../_StringProcessing/MatchingOptions.swift | 11 +- Sources/_StringProcessing/Regex/DSLTree.swift | 50 +-- Sources/_StringProcessing/Unicode/ASCII.swift | 90 ++++++ .../Utility/RegexFactory.swift | 20 +- .../RegexBuilderTests.swift | 29 ++ .../MatchingEngineTests.swift | 17 - Tests/RegexTests/CompileTests.swift | 21 ++ Tests/RegexTests/MatchTests.swift | 60 +++- 20 files changed, 1323 insertions(+), 160 deletions(-) create mode 100644 Sources/_StringProcessing/Engine/MEReverseQuantify.swift delete mode 100644 Tests/MatchingEngineTests/MatchingEngineTests.swift diff --git a/Sources/RegexBuilder/Anchor.swift b/Sources/RegexBuilder/Anchor.swift index ee3d5c2f8..48b2ce540 100644 --- a/Sources/RegexBuilder/Anchor.swift +++ b/Sources/RegexBuilder/Anchor.swift @@ -226,3 +226,64 @@ public struct NegativeLookahead: _BuiltinRegexComponent { self.init(_RegexFactory().negativeLookaheadNonCapturing(component())) } } + +/// A regex component that allows a match to continue only if its contents +/// match at the given location. +/// +/// A lookbehind is a zero-length assertion that its included regex matches at +/// a particular position. 
Lookbehinds do not advance the overall matching +/// position in the input string — once a lookbehind succeeds, matching continues +/// in the regex from the same position. +@available(SwiftStdlib 5.7, *) // TODO: How should this be gated? +public struct Lookbehind: _BuiltinRegexComponent { + public var regex: Regex + + init(_ regex: Regex) { + self.regex = regex + } + + /// Creates a lookbehind from the given regex component. + public init( + _ component: R + ) where R.RegexOutput == Output { + self.init(_RegexFactory().lookbehindNonCapturing(component)) + } + + /// Creates a lookbehind from the regex generated by the given builder closure. + public init( + @RegexComponentBuilder _ component: () -> R + ) where R.RegexOutput == Output { + self.init(_RegexFactory().lookbehindNonCapturing(component())) + } +} + +/// A regex component that allows a match to continue only if its contents +/// do not match at the given location. +/// +/// A negative lookbehind is a zero-length assertion that its included regex +/// does not match at a particular position. Lookbehinds do not advance the +/// overall matching position in the input string — once a lookbehind succeeds, +/// matching continues in the regex from the same position. +@available(SwiftStdlib 5.7, *) // TODO: How should this be gated? +public struct NegativeLookbehind: _BuiltinRegexComponent { + public var regex: Regex + + init(_ regex: Regex) { + self.regex = regex + } + + /// Creates a negative lookbehind from the given regex component. + public init( + _ component: R + ) where R.RegexOutput == Output { + self.init(_RegexFactory().negativeLookbehindNonCapturing(component)) + } + + /// Creates a negative lookbehind from the regex generated by the given builder + /// closure. 
+ public init( + @RegexComponentBuilder _ component: () -> R + ) where R.RegexOutput == Output { + self.init(_RegexFactory().negativeLookbehindNonCapturing(component())) + } +} diff --git a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift index be288491d..41aca8504 100644 --- a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift +++ b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift @@ -47,6 +47,9 @@ extension AST { // NSRegularExpression compatibility special-case case nsreCompatibleDot // no AST representation + + // Lookbehind support + case reverse // no AST representation } public var kind: Kind diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 1fdadd8de..3ec852aa8 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -523,6 +523,19 @@ extension Parser { mutating func parseCustomCharacterClass( _ start: Source.Located ) -> CustomCC { + // Excessively nested recursion is a common DOS attack, so limit + // our recursion. 
+ context.parseDepth += 1 + defer { context.parseDepth -= 1 } + guard context.parseDepth < context.maxParseDepth else { + self.errorAtCurrentPosition(.nestingTooDeep) + + // This is not generally recoverable and further errors will be + // incorrect + diags.suppressFurtherDiagnostics = true + return .init(start, [], start.location) + } + let alreadyInCCC = context.isInCustomCharacterClass context.isInCustomCharacterClass = true defer { context.isInCustomCharacterClass = alreadyInCCC } diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index d2f7c622d..1ae001101 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -143,7 +143,7 @@ extension RegexValidator { case .caseInsensitive, .possessiveByDefault, .reluctantByDefault, .singleLine, .multiline, .namedCapturesOnly, .extended, .extraExtended, .asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps, - .nsreCompatibleDot: + .nsreCompatibleDot, .reverse: break } } @@ -370,7 +370,7 @@ extension RegexValidator { } switch kind.value { case .capture, .namedCapture, .nonCapture, .lookahead, .negativeLookahead, - .atomicNonCapturing: + .atomicNonCapturing, .lookbehind, .negativeLookbehind: break case .balancedCapture: @@ -384,8 +384,8 @@ extension RegexValidator { case .nonAtomicLookahead: error(.unsupported("non-atomic lookahead"), at: kind.location) - case .lookbehind, .negativeLookbehind, .nonAtomicLookbehind: - error(.unsupported("lookbehind"), at: kind.location) + case .nonAtomicLookbehind: + error(.unsupported("non-atomic lookbehind"), at: kind.location) case .scriptRun, .atomicScriptRun: error(.unsupported("script run"), at: kind.location) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 6a00a0dfd..885573662 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -16,6 +16,9 @@ 
internal import _RegexParser extension Compiler { struct ByteCodeGen { + var reverse: Bool { + options.reversed + } var options: MatchingOptions var builder = MEProgram.Builder() /// A Boolean indicating whether the first matchable atom has been emitted. @@ -143,7 +146,8 @@ fileprivate extension Compiler.ByteCodeGen { guard options.semanticLevel == .graphemeCluster else { for char in s { - for scalar in char.unicodeScalars { + let scalars: any Collection = reverse ? char.unicodeScalars.reversed() : char.unicodeScalars + for scalar in scalars { emitMatchScalar(scalar) } } @@ -152,20 +156,27 @@ fileprivate extension Compiler.ByteCodeGen { // Fast path for eliding boundary checks for an all ascii quoted literal if optimizationsEnabled && s.allSatisfy(\.isASCII) && !s.isEmpty { - let lastIdx = s.unicodeScalars.indices.last! - for idx in s.unicodeScalars.indices { - let boundaryCheck = idx == lastIdx + let boundaryIdx = reverse ? s.unicodeScalars.indices.first! : s.unicodeScalars.indices.last! + let indices: any Collection = reverse + ? s.unicodeScalars.indices.reversed() + : s.unicodeScalars.indices + + for idx in indices { + let boundaryCheck = idx == boundaryIdx let scalar = s.unicodeScalars[idx] if options.isCaseInsensitive && scalar.properties.isCased { - builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck) + builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck, reverse: reverse) } else { - builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck) + builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck, reverse: reverse) } } return } - for c in s { emitCharacter(c) } + let chars: any Collection = reverse ? 
s.reversed() : s + for char in chars { + emitCharacter(char) + } } mutating func emitBackreference( @@ -212,18 +223,18 @@ fileprivate extension Compiler.ByteCodeGen { } mutating func emitCharacterClass(_ cc: DSLTree.Atom.CharacterClass) { - builder.buildMatchBuiltin(model: cc.asRuntimeModel(options)) + builder.buildMatchBuiltin(model: cc.asRuntimeModel(options), reverse: reverse) } mutating func emitMatchScalar(_ s: UnicodeScalar) { assert(options.semanticLevel == .unicodeScalar) if options.isCaseInsensitive && s.properties.isCased { - builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false) + builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false, reverse: reverse) } else { - builder.buildMatchScalar(s, boundaryCheck: false) + builder.buildMatchScalar(s, boundaryCheck: false, reverse: reverse) } } - + mutating func emitCharacter(_ c: Character) { // Unicode scalar mode matches the specific scalars that comprise a character if options.semanticLevel == .unicodeScalar { @@ -232,7 +243,7 @@ fileprivate extension Compiler.ByteCodeGen { } return } - + if options.isCaseInsensitive && c.isCased { if optimizationsEnabled && c.isASCII { // c.isCased ensures that c is not CR-LF, @@ -240,22 +251,25 @@ fileprivate extension Compiler.ByteCodeGen { assert(c.unicodeScalars.count == 1) builder.buildMatchScalarCaseInsensitive( c.unicodeScalars.last!, - boundaryCheck: true) + boundaryCheck: true, + reverse: reverse) } else { - builder.buildMatch(c, isCaseInsensitive: true) + builder.buildMatch(c, isCaseInsensitive: true, reverse: reverse) } return } - + if optimizationsEnabled && c.isASCII { let lastIdx = c.unicodeScalars.indices.last! 
for idx in c.unicodeScalars.indices { - builder.buildMatchScalar(c.unicodeScalars[idx], boundaryCheck: idx == lastIdx) + let scalar = c.unicodeScalars[idx] + let boundaryCheck = idx == lastIdx + builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck, reverse: reverse) } return } - - builder.buildMatch(c, isCaseInsensitive: false) + + builder.buildMatch(c, isCaseInsensitive: false, reverse: reverse) } mutating func emitAny() { @@ -270,9 +284,9 @@ fileprivate extension Compiler.ByteCodeGen { mutating func emitAnyNonNewline() { switch options.semanticLevel { case .graphemeCluster: - builder.buildConsumeNonNewline() + builder.buildConsumeNonNewline(reverse: reverse) case .unicodeScalar: - builder.buildConsumeScalarNonNewline() + builder.buildConsumeScalarNonNewline(reverse: reverse) } } @@ -341,20 +355,42 @@ fileprivate extension Compiler.ByteCodeGen { // be glueing sub-grapheme components together? try emitNode(node) } + + mutating func emitLookaround( + _ kind: (forwards: Bool, positive: Bool), + _ child: DSLTree.Node + ) throws { + guard !child.containsCustomConsumer else { + throw Unsupported("Lookarounds with custom consumers") + } + + if !kind.forwards { + defer { options.endScope() } + options.beginScope() + // TODO: JH - Is it okay to use .fake here? 
+ options.apply(.init(adding: [.init(.reverse, location: .fake)])) + } + + if kind.positive { + try emitPositiveLookaround(child) + } else { + try emitNegativeLookaround(child) + } + } - mutating func emitPositiveLookahead(_ child: DSLTree.Node) throws { + mutating func emitPositiveLookaround(_ child: DSLTree.Node) throws { /* - save(restoringAt: success) - save(restoringAt: intercept) - // failure restores at intercept - clearThrough(intercept) // remove intercept and any leftovers from + save(restoringAt: success) + save(restoringAt: intercept) + // failure restores at intercept + clearThrough(intercept) // remove intercept and any leftovers from fail(preservingCaptures: true) // ->success - intercept: - clearSavePoint // remove success - fail // propagate failure - success: - ... - */ + intercept: + clearSavePoint // remove success + fail // propagate failure + success: + ... + */ let intercept = builder.makeAddress() let success = builder.makeAddress() @@ -370,8 +406,8 @@ fileprivate extension Compiler.ByteCodeGen { builder.label(success) } - - mutating func emitNegativeLookahead(_ child: DSLTree.Node) throws { + + mutating func emitNegativeLookaround(_ child: DSLTree.Node) throws { /* save(restoringAt: success) save(restoringAt: intercept) @@ -399,20 +435,6 @@ fileprivate extension Compiler.ByteCodeGen { builder.label(success) } - - mutating func emitLookaround( - _ kind: (forwards: Bool, positive: Bool), - _ child: DSLTree.Node - ) throws { - guard kind.forwards else { - throw Unsupported("backwards assertions") - } - if kind.positive { - try emitPositiveLookahead(child) - } else { - try emitNegativeLookahead(child) - } - } mutating func emitAtomicNoncapturingGroup( _ child: DSLTree.Node @@ -472,15 +494,14 @@ fileprivate extension Compiler.ByteCodeGen { options.beginScope() defer { options.endScope() } - if let lookaround = kind.lookaroundKind { - try emitLookaround(lookaround, child) - return - } - switch kind { case .lookahead, .negativeLookahead, 
.lookbehind, .negativeLookbehind: - throw Unreachable("TODO: reason") + guard let lookaround = kind.lookaroundKind else { + throw Unreachable("TODO: reason") + } + + try emitLookaround(lookaround, child) case .capture, .namedCapture, .balancedCapture: throw Unreachable("These should produce a capture node") @@ -491,7 +512,7 @@ fileprivate extension Compiler.ByteCodeGen { } options.apply(optionSequence) try emitNode(child) - + case .atomicNonCapturing: try emitAtomicNoncapturingGroup(child) @@ -768,7 +789,8 @@ fileprivate extension Compiler.ByteCodeGen { guard let bitset = ccc.asAsciiBitset(options) else { return false } - builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + + builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics, reverse: reverse) case .atom(let atom): switch atom { @@ -778,24 +800,24 @@ fileprivate extension Compiler.ByteCodeGen { guard let bitset = DSLTree.CustomCharacterClass(members: [.atom(atom)]).asAsciiBitset(options) else { return false } - builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics, reverse: reverse) } else { // Uncased character OR case-sensitive matching; match as a single scalar ascii value character guard let val = c._singleScalarAsciiValue else { return false } - builder.buildQuantify(asciiChar: val, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + builder.buildQuantify(asciiChar: val, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics, reverse: reverse) } case .any: builder.buildQuantifyAny( - matchesNewlines: true, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + matchesNewlines: true, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics, reverse: reverse) case .anyNonNewline: 
builder.buildQuantifyAny( - matchesNewlines: false, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + matchesNewlines: false, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics, reverse: reverse) case .dot: builder.buildQuantifyAny( - matchesNewlines: options.dotMatchesNewline, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) + matchesNewlines: options.dotMatchesNewline, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics, reverse: reverse) case .characterClass(let cc): // Custom character class that consumes a single grapheme @@ -805,7 +827,9 @@ fileprivate extension Compiler.ByteCodeGen { kind, minTrips, maxExtraTrips, - isScalarSemantics: isScalarSemantics) + isScalarSemantics: isScalarSemantics, + reverse: reverse + ) default: return false } @@ -1119,9 +1143,9 @@ fileprivate extension Compiler.ByteCodeGen { if let asciiBitset = ccc.asAsciiBitset(options), optimizationsEnabled { if options.semanticLevel == .unicodeScalar { - builder.buildScalarMatchAsciiBitset(asciiBitset) + builder.buildScalarMatchAsciiBitset(asciiBitset, reverse: reverse) } else { - builder.buildMatchAsciiBitset(asciiBitset) + builder.buildMatchAsciiBitset(asciiBitset, reverse: reverse) } return } @@ -1203,7 +1227,7 @@ fileprivate extension Compiler.ByteCodeGen { return [node] } } - let children = children + var children = children .flatMap(flatten) .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in switch node { @@ -1222,6 +1246,9 @@ fileprivate extension Compiler.ByteCodeGen { return false } } + if reverse { + children.reverse() + } for child in children { try emitConcatenationComponent(child) } @@ -1230,7 +1257,6 @@ fileprivate extension Compiler.ByteCodeGen { @discardableResult mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? 
{ switch node { - case let .orderedChoice(children): try emitAlternation(children) @@ -1389,3 +1415,28 @@ extension DSLTree.CustomCharacterClass { return false } } + +extension DSLTree.Node { + var containsCustomConsumer: Bool { + switch self { + case .orderedChoice(let array), .concatenation(let array): + array.contains { $0.containsCustomConsumer } + case .capture(_, _, let node, _): + node.containsCustomConsumer + case .nonCapturingGroup(_, let node): + node.containsCustomConsumer + case .ignoreCapturesInTypedOutput(let node): + node.containsCustomConsumer + case .conditional(_, let node, let node2): + node.containsCustomConsumer || node2.containsCustomConsumer + case .quantification(_, _, let node): + node.containsCustomConsumer + case .convertedRegexLiteral(let node, _): + node.containsCustomConsumer + case .customCharacterClass, .atom, .trivia, .empty, .quotedLiteral, .absentFunction, .characterPredicate: + false + case .consumer, .matcher: + true + } + } +} diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 80bfd9b05..d3a3d5fad 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -94,6 +94,13 @@ extension Instruction { /// Operand: Amount to advance by. case advance + /// Reverse the input position. + /// + /// reverse(_ amount: Distance) + /// + /// Operand: Amount to reverse by. + case reverse + // TODO: Is the amount useful here? Is it commonly more than 1? /// Composite assert-advance else restore. @@ -105,6 +112,15 @@ extension Instruction { /// - Boolean for if we should match in a case insensitive way case match + /// Composite reverse-assert else restore. + /// + /// reverseMatch(_: EltReg, isCaseInsensitive: Bool) + /// + /// Operands: + /// - Element register to compare against. 
+ /// - Boolean for if we should match in a case insensitive way + case reverseMatch + /// Match against a scalar and possibly perform a boundary check or match in a case insensitive way /// /// matchScalar(_: Unicode.Scalar, isCaseInsensitive: Bool, boundaryCheck: Bool) @@ -112,6 +128,12 @@ extension Instruction { /// Operands: Scalar value to match against and booleans case matchScalar + /// Reverse match against a scalar and possibly perform a boundary check or reverse match in a case insensitive way + /// + /// reverseMatchScalar(_: Unicode.Scalar, isCaseInsensitive: Bool, boundaryCheck: Bool) + /// + /// Operands: Scalar value to match against and booleans + case reverseMatchScalar /// Match directly (binary semantics) against a series of UTF-8 bytes /// /// NOTE: Compiler should ensure to only emit this instruction when normalization @@ -132,6 +154,15 @@ extension Instruction { /// - Boolean for if we should match by scalar value case matchBitset + /// Reverse match a character or a scalar against a set of valid ascii values stored in a bitset + /// + /// reverseMatchBitset(_: AsciiBitsetRegister, isScalar: Bool) + /// + /// Operand: + /// - Ascii bitset register containing the bitset + /// - Boolean for if we should match by scalar value + case reverseMatchBitset + /// Match against a built-in character class /// /// matchBuiltin(_: CharacterClassPayload) @@ -141,11 +172,25 @@ extension Instruction { /// - If it is inverted /// - If it strictly matches only ascii values case matchBuiltin - + + /// Reverse match against a built-in character class + /// + /// reverseMatchBuiltin(_: CharacterClassPayload) + /// + /// Operand: the payload contains + /// - The character class + /// - If it is inverted + /// - If it strictly matches only ascii values + case reverseMatchBuiltin + /// Matches any non newline character /// Operand: If we are in scalar mode or not case matchAnyNonNewline + /// Reverse matches any non newline character + /// Operand: If we are in 
scalar mode or not + case reverseMatchAnyNonNewline + // MARK: Extension points /// Advance the input position based on the result by calling the consume @@ -212,7 +257,7 @@ extension Instruction { /// Fused save-and-branch. /// - /// split(to: target, saving: backtrackPoint) + /// split(to: target, saving: backtrackPoint) /// case splitSaving @@ -223,6 +268,13 @@ extension Instruction { /// quantify(_:QuantifyPayload) /// case quantify + /// Fused reverse quantify, execute, save instruction + /// Quantifies the stored instruction in an inner loop instead of looping through instructions in processor + /// Only quantifies specific nodes + /// + /// reverseQuantify(_:QuantifyPayload) + /// + case reverseQuantify /// Begin the given capture /// /// beginCapture(_:CapReg) @@ -266,7 +318,6 @@ extension Instruction { // TODO: Fused assertions. It seems like we often want to // branch based on assertion fail or success. - } } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 1a26421eb..5efad688a 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -14,7 +14,7 @@ internal import _RegexParser // For errors extension MEProgram { struct Builder { var instructions: [Instruction] = [] - + // Tracing var enableTracing = false var enableMetrics = false @@ -179,23 +179,34 @@ extension MEProgram.Builder { mutating func buildAdvance(_ n: Distance) { instructions.append(.init(.advance, .init(distance: n))) } - + + mutating func buildReverse(_ n: Distance) { + instructions.append(.init(.reverse, .init(distance: n))) + } + + mutating func buildReverseUnicodeScalar(_ n: Distance) { + instructions.append(.init(.reverse, .init(distance: n, isScalarDistance: true))) + } + mutating func buildAdvanceUnicodeScalar(_ n: Distance) { instructions.append( .init(.advance, .init(distance: n, isScalarDistance: true))) } - - mutating func buildConsumeNonNewline() { - 
instructions.append(.init(.matchAnyNonNewline, .init(isScalar: false))) + + mutating func buildConsumeNonNewline(reverse: Bool) { + let opcode = reverse ? Instruction.OpCode.reverseMatchAnyNonNewline : .matchAnyNonNewline + instructions.append(.init(opcode, .init(isScalar: false))) } - - mutating func buildConsumeScalarNonNewline() { - instructions.append(.init(.matchAnyNonNewline, .init(isScalar: true))) + + mutating func buildConsumeScalarNonNewline(reverse: Bool) { + let opcode = reverse ? Instruction.OpCode.reverseMatchAnyNonNewline : .matchAnyNonNewline + instructions.append(.init(opcode, .init(isScalar: true))) } - mutating func buildMatch(_ e: Character, isCaseInsensitive: Bool) { + mutating func buildMatch(_ e: Character, isCaseInsensitive: Bool, reverse: Bool) { + let opcode = reverse ? Instruction.OpCode.reverseMatch : .match instructions.append(.init( - .match, .init(element: elements.store(e), isCaseInsensitive: isCaseInsensitive))) + opcode, .init(element: elements.store(e), isCaseInsensitive: isCaseInsensitive))) } mutating func buildMatchUTF8(_ utf8: Array, boundaryCheck: Bool) { @@ -203,32 +214,38 @@ extension MEProgram.Builder { utf8: utf8Contents.store(utf8), boundaryCheck: boundaryCheck))) } - mutating func buildMatchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) { - instructions.append(.init(.matchScalar, .init(scalar: s, caseInsensitive: false, boundaryCheck: boundaryCheck))) - } - - mutating func buildMatchScalarCaseInsensitive(_ s: Unicode.Scalar, boundaryCheck: Bool) { - instructions.append(.init(.matchScalar, .init(scalar: s, caseInsensitive: true, boundaryCheck: boundaryCheck))) + mutating func buildMatchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool, reverse: Bool) { + let opcode = reverse ? 
Instruction.OpCode.reverseMatchScalar : .matchScalar + instructions.append(.init(opcode, .init(scalar: s, caseInsensitive: false, boundaryCheck: boundaryCheck))) } + mutating func buildMatchScalarCaseInsensitive(_ s: Unicode.Scalar, boundaryCheck: Bool, reverse: Bool) { + let opcode = reverse ? Instruction.OpCode.reverseMatchScalar : .matchScalar + instructions.append(.init(opcode, .init(scalar: s, caseInsensitive: true, boundaryCheck: boundaryCheck))) + } mutating func buildMatchAsciiBitset( - _ b: DSLTree.CustomCharacterClass.AsciiBitset + _ b: DSLTree.CustomCharacterClass.AsciiBitset, + reverse: Bool ) { + let opcode = reverse ? Instruction.OpCode.reverseMatchBitset : .matchBitset instructions.append(.init( - .matchBitset, .init(bitset: makeAsciiBitset(b), isScalar: false))) + opcode, .init(bitset: makeAsciiBitset(b), isScalar: false))) } mutating func buildScalarMatchAsciiBitset( - _ b: DSLTree.CustomCharacterClass.AsciiBitset + _ b: DSLTree.CustomCharacterClass.AsciiBitset, + reverse: Bool ) { + let opcode = reverse ? Instruction.OpCode.reverseMatchBitset : .matchBitset instructions.append(.init( - .matchBitset, .init(bitset: makeAsciiBitset(b), isScalar: true))) + opcode, .init(bitset: makeAsciiBitset(b), isScalar: true))) } - - mutating func buildMatchBuiltin(model: _CharacterClassModel) { + + mutating func buildMatchBuiltin(model: _CharacterClassModel, reverse: Bool) { + let opcode = reverse ? Instruction.OpCode.reverseMatchBuiltin : .matchBuiltin instructions.append(.init( - .matchBuiltin, .init(model))) + opcode, .init(model))) } mutating func buildConsume( @@ -261,10 +278,12 @@ extension MEProgram.Builder { _ kind: AST.Quantification.Kind, _ minTrips: Int, _ maxExtraTrips: Int?, - isScalarSemantics: Bool + isScalarSemantics: Bool, + reverse: Bool ) { + let opcode = reverse ? 
Instruction.OpCode.reverseQuantify : .quantify instructions.append(.init( - .quantify, + opcode, .init(quantify: .init(bitset: makeAsciiBitset(bitset), kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics)))) } @@ -273,10 +292,12 @@ extension MEProgram.Builder { _ kind: AST.Quantification.Kind, _ minTrips: Int, _ maxExtraTrips: Int?, - isScalarSemantics: Bool + isScalarSemantics: Bool, + reverse: Bool ) { + let opcode = reverse ? Instruction.OpCode.reverseQuantify : .quantify instructions.append(.init( - .quantify, + opcode, .init(quantify: .init(asciiChar: asciiChar, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics)))) } @@ -285,10 +306,12 @@ extension MEProgram.Builder { _ kind: AST.Quantification.Kind, _ minTrips: Int, _ maxExtraTrips: Int?, - isScalarSemantics: Bool + isScalarSemantics: Bool, + reverse: Bool ) { + let opcode = reverse ? Instruction.OpCode.reverseQuantify : .quantify instructions.append(.init( - .quantify, + opcode, .init(quantify: .init(matchesNewlines: matchesNewlines, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics)))) } @@ -297,10 +320,12 @@ extension MEProgram.Builder { _ kind: AST.Quantification.Kind, _ minTrips: Int, _ maxExtraTrips: Int?, - isScalarSemantics: Bool + isScalarSemantics: Bool, + reverse: Bool ) { + let opcode = reverse ? 
Instruction.OpCode.reverseQuantify : .quantify instructions.append(.init( - .quantify, + opcode, .init(quantify: .init(model: model,kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics)))) } @@ -583,7 +608,7 @@ extension MEProgram.Builder { defer { asciiBitsets.append(b) } return AsciiBitsetRegister(asciiBitsets.count) } - + mutating func makeConsumeFunction( _ f: @escaping MEProgram.ConsumeFunction ) -> ConsumeFunctionRegister { diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index ab47a1a5f..5ac33fcdf 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -30,6 +30,27 @@ extension Processor { return true } + mutating func reverseMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> Bool { + guard currentPosition >= start, let previous = input.reverseMatchBuiltinCC( + cc, + at: currentPosition, + limitedBy: start, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics + ) else { + signalFailure() + return false + } + currentPosition = previous + return true + } + func isAtStartOfLine(_ payload: AssertionPayload) -> Bool { // TODO: needs benchmark coverage if currentPosition == subjectBounds.lowerBound { return true } @@ -160,7 +181,49 @@ extension String { ? nil : (substr.first!, substr.endIndex) } - + + /// Returns the character at `pos`, bounded by `start`, as well as the lower + /// boundary of the returned character. + /// + /// This function handles loading a character from a string while respecting + /// a start boundary, even if that start boundary is sub-character or sub-scalar. + /// + /// - If `pos` is at or before `start`, this function returns `nil`. 
+ /// - If `start` is between the previous grapheme cluster boundary and `pos` (i.e., + /// `start` is after `self.index(before: pos)`), then the returned character + /// is smaller than the one that would be produced by `self[pos]` and the + /// returned index is at the start of that character. + /// - If `start` is between the previous grapheme cluster boundary and `pos`, and + /// is not on a Unicode scalar boundary, the partial scalar is dropped. This + /// can result in a `nil` return or a character that includes only part of + /// the `self[pos]` character. + /// + /// - Parameters: + /// - pos: The position to load a character from. + /// - start: The limit for the character at `pos`. + /// - Returns: The character at `pos`, bounded by `start`, if it exists, along + /// with the lower bound of that character. The lower bound is always + /// scalar-aligned. + func characterAndStart( + at pos: String.Index, + limitedBy start: String.Index + ) -> (Character, characterStart: String.Index)? { + // FIXME: Sink into the stdlib to avoid multiple boundary calculations + guard pos > start else { return nil } + let previous = index(before: pos) + if previous >= start { + return (self[pos], previous) + } + + // `start` must be a sub-character position that is between the previous + // grapheme boundary and `pos`. This is okay if `start` is on a Unicode scalar + // boundary, but if it's in the middle of a scalar's code units, there + // may not be a character to return at all after rounding down. Use + // `Substring`'s rounding to determine what we can return. + let substr = self[start.. String.Index? 
{ + guard currentPosition >= start else { return nil } + if case .definite(let result) = _quickReverseMatchAnyNonNewline( + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics + ) { + assert(result == _thoroughReverseMatchAnyNonNewline( + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics)) + return result + } + return _thoroughReverseMatchAnyNonNewline( + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics) + } + @inline(__always) private func _quickMatchAnyNonNewline( at currentPosition: String.Index, @@ -205,6 +291,27 @@ extension String { } } + @inline(__always) + private func _quickReverseMatchAnyNonNewline( + at currentPosition: String.Index, + limitedBy start: String.Index, + isScalarSemantics: Bool + ) -> QuickResult { + assert(currentPosition >= start) + guard let (asciiValue, previous, isCRLF) = _quickReverseASCIICharacter( + at: currentPosition, limitedBy: start + ) else { + return .unknown + } + switch asciiValue { + case (._lineFeed)...(._carriageReturn): + return .definite(nil) + default: + assert(!isCRLF) + return .definite(previous) + } + } + @inline(never) private func _thoroughMatchAnyNonNewline( at currentPosition: String.Index, @@ -224,6 +331,25 @@ extension String { return next } + @inline(never) + private func _thoroughReverseMatchAnyNonNewline( + at currentPosition: String.Index, + limitedBy start: String.Index, + isScalarSemantics: Bool + ) -> String.Index? 
{ + if isScalarSemantics { + guard currentPosition >= start else { return nil } + let scalar = unicodeScalars[currentPosition] + guard !scalar.isNewline else { return nil } + return unicodeScalars.index(before: currentPosition) + } + + guard let (char, previous) = characterAndStart(at: currentPosition, limitedBy: start), + !char.isNewline + else { return nil } + return previous + } + internal func matchRegexDot( at currentPosition: Index, limitedBy end: Index, @@ -282,6 +408,41 @@ extension String { isScalarSemantics: isScalarSemantics) } + func reverseMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + at currentPosition: String.Index, + limitedBy start: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> String.Index? { + guard currentPosition >= start else { return nil } + if case .definite(let result) = _quickReverseMatchBuiltinCC( + cc, + at: currentPosition, + limitedBy: start, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics + ) { + assert(result == _thoroughReverseMatchBuiltinCC( + cc, + at: currentPosition, + limitedBy: start, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics)) + return result + } + return _thoroughReverseMatchBuiltinCC( + cc, + at: currentPosition, + limitedBy: start, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) + } + // Mentioned in ProgrammersManual.md, update docs if redesigned @inline(__always) private func _quickMatchBuiltinCC( @@ -304,6 +465,27 @@ extension String { return .definite(result == isInverted ? 
nil : next) } + @inline(__always) + private func _quickReverseMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + at currentPosition: String.Index, + limitedBy start: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> QuickResult { + assert(currentPosition >= start) + guard let (previous, result) = _quickReverseMatch( + cc, + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics + ) else { + return .unknown + } + return .definite(result == isInverted ? nil : previous) + } + // Mentioned in ProgrammersManual.md, update docs if redesigned @inline(never) private func _thoroughMatchBuiltinCC( @@ -386,4 +568,87 @@ extension String { } return next } + + @inline(never) + private func _thoroughReverseMatchBuiltinCC( + _ cc: _CharacterClassModel.Representation, + at currentPosition: String.Index, + limitedBy start: String.Index, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> String.Index? { + // TODO: Branch here on scalar semantics + // Don't want to pay character cost if unnecessary + guard let (char, previousIndex) = + characterAndStart(at: currentPosition, limitedBy: start) + else { return nil } + var previous = previousIndex + let scalar = unicodeScalars[currentPosition] + + let asciiCheck = !isStrictASCII + || (scalar.isASCII && isScalarSemantics) + || char.isASCII + + var matched: Bool + if isScalarSemantics && cc != .anyGrapheme { + previous = unicodeScalars.index(before: currentPosition) + } + + switch cc { + case .any, .anyGrapheme: + matched = true + case .digit: + if isScalarSemantics { + matched = scalar.properties.numericType != nil && asciiCheck + } else { + matched = char.isNumber && asciiCheck + } + case .horizontalWhitespace: + if isScalarSemantics { + matched = scalar.isHorizontalWhitespace && asciiCheck + } else { + matched = char._isHorizontalWhitespace && asciiCheck + } + case .verticalWhitespace: + if isScalarSemantics { + matched = 
scalar.isNewline && asciiCheck + } else { + matched = char._isNewline && asciiCheck + } + case .newlineSequence: + if isScalarSemantics { + matched = scalar.isNewline && asciiCheck + if matched && scalar == "\r" + && previous >= start && unicodeScalars[previous] == "\n" { + // Match a full CR-LF sequence even in scalar semantics + unicodeScalars.formIndex(after: &previous) + } + } else { + matched = char._isNewline && asciiCheck + } + case .whitespace: + if isScalarSemantics { + matched = scalar.properties.isWhitespace && asciiCheck + } else { + matched = char.isWhitespace && asciiCheck + } + case .word: + if isScalarSemantics { + matched = scalar.properties.isAlphabetic && asciiCheck + } else { + matched = char.isWordCharacter && asciiCheck + } + } + + if isInverted { + matched.toggle() + } + + guard matched else { + return nil + } + + return previous + } } diff --git a/Sources/_StringProcessing/Engine/MEReverseQuantify.swift b/Sources/_StringProcessing/Engine/MEReverseQuantify.swift new file mode 100644 index 000000000..5f1afb1bc --- /dev/null +++ b/Sources/_StringProcessing/Engine/MEReverseQuantify.swift @@ -0,0 +1,177 @@ +extension Processor { + func _doReverseQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? 
{ + let isScalarSemantics = payload.isScalarSemantics + + switch payload.type { + case .asciiBitset: + return input.reverseMatchASCIIBitset( + registers[payload.bitset], + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics) + case .asciiChar: + return input.reverseMatchScalar( + UnicodeScalar.init(_value: UInt32(payload.asciiChar)), + at: currentPosition, + limitedBy: start, + boundaryCheck: !isScalarSemantics, + isCaseInsensitive: false) + case .builtinCC: + guard currentPosition >= start else { return nil } + + // We only emit .quantify if it consumes a single character + return input.reverseMatchBuiltinCC( + payload.builtinCC, + at: currentPosition, + limitedBy: start, + isInverted: payload.builtinIsInverted, + isStrictASCII: payload.builtinIsStrict, + isScalarSemantics: isScalarSemantics) + case .any: + guard currentPosition >= start else { return nil } + + if payload.anyMatchesNewline { + if isScalarSemantics { + return input.unicodeScalars.index(before: currentPosition) + } + return input.index(before: currentPosition) + } + + return input.reverseMatchAnyNonNewline( + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics) + } + } + + /// Generic bounded reverseQuantify instruction interpreter + /// - Handles .eager and .posessive + /// - Handles arbitrary minTrips and maxExtraTrips + mutating func runReverseQuantify(_ payload: QuantifyPayload) -> Bool { + assert(payload.quantKind != .reluctant) + + var trips = 0 + var maxExtraTrips = payload.maxExtraTrips + + while trips < payload.minTrips { + guard let previous = _doReverseQuantifyMatch(payload) else { + signalFailure() + return false + } + + currentPosition = previous + + // If we've reached the start of the string but still have more trips, fail + if currentPosition == start, trips < payload.minTrips { + signalFailure() + return false + } + + trips += 1 + } + + // If we don't have any more trips to take: + if maxExtraTrips == 0 { + // We're done + 
return true + } + + // We've already consumed the minimum number of characters, + // If we can't get another match, the reverse quantify was successful + guard let previous = _doReverseQuantifyMatch(payload) else { + return true + } + maxExtraTrips = maxExtraTrips.map { $0 - 1 } + + // Remember the range of valid positions in case we can create a quantified + // save point + var rangeStart = currentPosition + let rangeEnd = currentPosition + currentPosition = previous + + while true { + if maxExtraTrips == 0 { break } + + guard let previous = _doReverseQuantifyMatch(payload) else { + break + } + maxExtraTrips = maxExtraTrips.map({$0 - 1}) + rangeStart = currentPosition + currentPosition = previous + } + + if payload.quantKind == .eager { + savePoints.append(makeQuantifiedSavePoint( + rangeStart.. Bool { + assert(payload.quantKind == .eager + && payload.minTrips == 1 + && payload.maxExtraTrips == nil) + + // Match at least once + guard let previous = _doReverseQuantifyMatch(payload) else { + signalFailure() + return false + } + + // Run `a+` as `aa*` + currentPosition = previous + _doRunEagerZeroOrMoreReverseQuantify(payload) + return true + } + + /// Specialized quantify instruction interpreter for ? 
+ mutating func runZeroOrOneReverseQuantify(_ payload: QuantifyPayload) -> Bool { + assert(payload.minTrips == 0 + && payload.maxExtraTrips == 1) + let previous = _doReverseQuantifyMatch(payload) + guard let idx = previous else { + return true // matched zero times + } + if payload.quantKind != .possessive { + // Save the zero match + savePoints.append(makeSavePoint(resumingAt: currentPC+1)) + } + currentPosition = idx + return true + } +} diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 0bf19b829..e6a93280d 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -219,6 +219,33 @@ extension Processor { return false } + // Reverse in our input + // + // Returns whether the advance succeeded. On failure, our + // save point was restored + mutating func reverseConsume(_ n: Distance) -> Bool { + // TODO: needs benchmark coverage + if let idx = input.index( + currentPosition, offsetBy: -n.rawValue, limitedBy: start + ) { + currentPosition = idx + return true + } + + // If `start` falls in the middle of a character, and we are trying to advance + // by one "character", then we should max out at `start` even though the above + // advancement will result in `nil`. 
+ if n == 1, let idx = input.unicodeScalars.index( + currentPosition, offsetBy: -n.rawValue, limitedBy: start + ) { + currentPosition = idx + return true + } + + signalFailure() + return false + } + // Advances in unicode scalar view mutating func consumeScalar(_ n: Distance) -> Bool { // TODO: needs benchmark coverage @@ -232,6 +259,19 @@ extension Processor { return true } + // Reverses in unicode scalar view + mutating func reverseConsumeScalar(_ n: Distance) -> Bool { + // TODO: needs benchmark coverage + guard let idx = input.unicodeScalars.index( + currentPosition, offsetBy: -n.rawValue, limitedBy: start + ) else { + signalFailure() + return false + } + currentPosition = idx + return true + } + /// Continue matching at the specified index. /// /// - Precondition: `bounds.contains(index) || index == bounds.upperBound` @@ -279,6 +319,33 @@ extension Processor { return true } + // Reverse match against the current input element. Returns whether + // it succeeded vs signaling an error. + mutating func reverseMatch( + _ e: Element, isCaseInsensitive: Bool + ) -> Bool { + let previous = input.reverseMatch( + e, + at: currentPosition, + limitedBy: start, + isCaseInsensitive: isCaseInsensitive + ) + + guard let previous else { + guard currentPosition == start else { + // If there's no previous character, and we're not + // at the start of the string, the match has failed + signalFailure() + return false + } + + return true + } + + currentPosition = previous + return true + } + // Match against the current input prefix. Returns whether // it succeeded vs signaling an error. 
mutating func matchSeq( @@ -318,6 +385,32 @@ extension Processor { return true } + mutating func reverseMatchScalar( + _ s: Unicode.Scalar, + boundaryCheck: Bool, + isCaseInsensitive: Bool + ) -> Bool { + let previous = input.reverseMatchScalar( + s, + at: currentPosition, + limitedBy: start, + boundaryCheck: boundaryCheck, + isCaseInsensitive: isCaseInsensitive + ) + + guard let previous else { + guard currentPosition == start else { + signalFailure() + return false + } + + return true + } + + currentPosition = previous + return true + } + // TODO: bytes should be a Span or RawSpan mutating func matchUTF8( _ bytes: Array, @@ -356,6 +449,26 @@ extension Processor { return true } + // If we have a bitset we know that the CharacterClass only matches against + // ascii characters, so check if the current input element is ascii then + // check if it is set in the bitset + mutating func reverseMatchBitset( + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, + isScalarSemantics: Bool + ) -> Bool { + guard let previous = input.reverseMatchASCIIBitset( + bitset, + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics + ) else { + signalFailure() + return false + } + currentPosition = previous + return true + } + // Matches the next character/scalar if it is not a newline mutating func matchAnyNonNewline( isScalarSemantics: Bool @@ -372,6 +485,22 @@ extension Processor { return true } + // Matches the previous character/scalar if it is not a newline + mutating func reverseMatchAnyNonNewline( + isScalarSemantics: Bool + ) -> Bool { + guard let previous = input.reverseMatchAnyNonNewline( + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics + ) else { + signalFailure() + return false + } + currentPosition = previous + return true + } + mutating func signalFailure(preservingCaptures: Bool = false) { guard !savePoints.isEmpty else { state = .fail @@ -535,16 +664,35 @@ extension Processor { controller.step() } } + case 
.reverse: + let (isScalar, distance) = payload.distance + if isScalar { + if reverseConsumeScalar(distance) { + controller.step() + } + } else { + if reverseConsume(distance) { + controller.step() + } + } case .matchAnyNonNewline: if matchAnyNonNewline(isScalarSemantics: payload.isScalar) { controller.step() } + case .reverseMatchAnyNonNewline: + if reverseMatchAnyNonNewline(isScalarSemantics: payload.isScalar) { + controller.step() + } case .match: let (isCaseInsensitive, reg) = payload.elementPayload if match(registers[reg], isCaseInsensitive: isCaseInsensitive) { controller.step() } - + case .reverseMatch: + let (isCaseInsensitive, reg) = payload.elementPayload + if reverseMatch(registers[reg], isCaseInsensitive: isCaseInsensitive) { + controller.step() + } case .matchScalar: let (scalar, caseInsensitive, boundaryCheck) = payload.scalarPayload if matchScalar( @@ -554,6 +702,15 @@ extension Processor { ) { controller.step() } + case .reverseMatchScalar: + let (scalar, caseInsensitive, boundaryCheck) = payload.scalarPayload + if reverseMatchScalar( + scalar, + boundaryCheck: boundaryCheck, + isCaseInsensitive: caseInsensitive + ) { + controller.step() + } case .matchUTF8: let (utf8Reg, boundaryCheck) = payload.matchUTF8Payload @@ -570,7 +727,12 @@ extension Processor { if matchBitset(bitset, isScalarSemantics: isScalar) { controller.step() } - + case .reverseMatchBitset: + let (isScalar, reg) = payload.bitsetPayload + let bitset = registers[reg] + if reverseMatchBitset(bitset, isScalarSemantics: isScalar) { + controller.step() + } case .matchBuiltin: let payload = payload.characterClassPayload if matchBuiltinCC( @@ -581,10 +743,40 @@ extension Processor { ) { controller.step() } + case .reverseMatchBuiltin: + let payload = payload.characterClassPayload + if reverseMatchBuiltinCC( + payload.cc, + isInverted: payload.isInverted, + isStrictASCII: payload.isStrictASCII, + isScalarSemantics: payload.isScalarSemantics + ) { + controller.step() + } case .quantify: if 
runQuantify(payload.quantify) { controller.step() } + case .reverseQuantify: + let quantPayload = payload.quantify + let matched: Bool + switch (quantPayload.quantKind, quantPayload.minTrips, quantPayload.maxExtraTrips) { + case (.reluctant, _, _): + assertionFailure(".reluctant is not supported by .quantify") + return + case (.eager, 0, nil): + runEagerZeroOrMoreReverseQuantify(quantPayload) + matched = true + case (.eager, 1, nil): + matched = runEagerOneOrMoreReverseQuantify(quantPayload) + case (_, 0, 1): + matched = runZeroOrOneReverseQuantify(quantPayload) + default: + matched = runReverseQuantify(quantPayload) + } + if matched { + controller.step() + } case .consumeBy: let reg = payload.consumer @@ -715,6 +907,25 @@ extension String { return next } + func reverseMatch( + _ char: Character, + at pos: Index, + limitedBy start: String.Index, + isCaseInsensitive: Bool + ) -> Index? { + // TODO: This can be greatly sped up with string internals + // TODO: This is also very much quick-check-able + guard let (stringChar, next) = characterAndStart(at: pos, limitedBy: start) else { return nil } + + if isCaseInsensitive { + guard stringChar.lowercased() == char.lowercased() else { return nil } + } else { + guard stringChar == char else { return nil } + } + + return next + } + func matchSeq( _ seq: Substring, at pos: Index, @@ -774,6 +985,38 @@ extension String { return idx } + func reverseMatchScalar( + _ scalar: Unicode.Scalar, + at pos: Index, + limitedBy start: String.Index, + boundaryCheck: Bool, + isCaseInsensitive: Bool + ) -> Index? 
{ + // TODO: extremely quick-check-able + // TODO: can be sped up with string internals + guard pos >= start else { return nil } + let curScalar = unicodeScalars[pos] + + if isCaseInsensitive { + guard curScalar.properties.lowercaseMapping == scalar.properties.lowercaseMapping + else { + return nil + } + } else { + guard curScalar == scalar else { return nil } + } + + guard pos != start else { return pos } + let idx = unicodeScalars.index(before: pos) + assert(idx >= start, "Input is a substring with a sub-scalar startIndex.") + + if boundaryCheck && !isOnGraphemeClusterBoundary(idx) { + return nil + } + + return idx + } + func matchUTF8( _ bytes: Array, at pos: Index, @@ -844,4 +1087,54 @@ extension String { return next } + + func reverseMatchASCIIBitset( + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, + at pos: Index, + limitedBy start: Index, + isScalarSemantics: Bool + ) -> Index? { + + // FIXME: Inversion should be tracked and handled in only one place. + // That is, we should probably store it as a bit in the instruction, so that + // bitset matching and bitset inversion is bit-based rather that semantically + // inverting the notion of a match or not. As-is, we need to track both + // meanings in some code paths. 
+ let isInverted = bitset.isInverted + + // TODO: More fodder for refactoring `_quickASCIICharacter`, see the comment + // there + guard let (asciiByte, previous, isCRLF) = _quickReverseASCIICharacter( + at: pos, + limitedBy: start + ) else { + if isScalarSemantics { + guard pos >= start else { return nil } + guard bitset.matches(unicodeScalars[pos]) else { return nil } + return unicodeScalars.index(before: pos) + } else { + guard let (char, previous) = characterAndStart(at: pos, limitedBy: start), + bitset.matches(char) else { return nil } + return previous + } + } + + guard bitset.matches(asciiByte) else { + // FIXME: check inversion here after refactored out of bitset + return nil + } + + // CR-LF should only match `[\r]` in scalar semantic mode or if inverted + if isCRLF { + if isScalarSemantics { + return self.unicodeScalars.index(after: previous) + } + if isInverted { + return previous + } + return nil + } + + return previous + } } diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index b67cbb6a5..e43b79264 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ b/Sources/_StringProcessing/Engine/Tracing.swift @@ -65,6 +65,13 @@ extension Instruction: CustomStringConvertible { } else { return "match char[\(reg)]" } + case .reverseMatch: + let (isCaseInsensitive, reg) = payload.elementPayload + if isCaseInsensitive { + return "reverseMatchCaseInsensitive char[\(reg)]" + } else { + return "reverseMatch char[\(reg)]" + } case .matchBitset: let (isScalar, reg) = payload.bitsetPayload if isScalar { @@ -72,9 +79,19 @@ extension Instruction: CustomStringConvertible { } else { return "matchBitset bitset[\(reg)]" } + case .reverseMatchBitset: + let (isScalar, reg) = payload.bitsetPayload + if isScalar { + return "reverseMatchBitsetScalar bitset[\(reg)]" + } else { + return "reverseMatchBitset bitset[\(reg)]" + } case .matchBuiltin: let payload = payload.characterClassPayload return "matchBuiltin 
\(payload.cc) (\(payload.isInverted))" + case .reverseMatchBuiltin: + let payload = payload.characterClassPayload + return "\(opcode) \(payload.cc) (\(payload.isInverted))" case .matchBy: let (matcherReg, valReg) = payload.pairedMatcherValue return "\(opcode) match[\(matcherReg)] -> val[\(valReg)]" @@ -85,6 +102,13 @@ extension Instruction: CustomStringConvertible { } else { return "matchScalar '\(scalar)' boundaryCheck: \(boundaryCheck)" } + case .reverseMatchScalar: + let (scalar, caseInsensitive, boundaryCheck) = payload.scalarPayload + if caseInsensitive { + return "reverseMatchScalarCaseInsensitive '\(scalar)' boundaryCheck: \(boundaryCheck)" + } else { + return "reverseMatchScalar '\(scalar)' boundaryCheck: \(boundaryCheck)" + } case .moveCurrentPosition: let reg = payload.position return "\(opcode) -> pos[\(reg)]" @@ -94,6 +118,9 @@ extension Instruction: CustomStringConvertible { case .quantify: let payload = payload.quantify return "\(opcode) \(payload.type) \(payload.minTrips) \(payload.maxExtraTrips?.description ?? "unbounded" )" + case .reverseQuantify: + let payload = payload.quantify + return "\(opcode) \(payload.type) \(payload.minTrips) \(payload.maxExtraTrips?.description ?? 
"unbounded" )" case .save: let resumeAddr = payload.addr return "\(opcode) \(resumeAddr)" @@ -106,6 +133,8 @@ extension Instruction: CustomStringConvertible { case .transformCapture: let (cap, trans) = payload.pairedCaptureTransform return "\(opcode) trans[\(trans)](\(cap))" + case .reverse: + return "\(opcode) \(payload.distance)" default: return "\(opcode)" } diff --git a/Sources/_StringProcessing/LiteralPrinter.swift b/Sources/_StringProcessing/LiteralPrinter.swift index 5c136827c..afae5b6fb 100644 --- a/Sources/_StringProcessing/LiteralPrinter.swift +++ b/Sources/_StringProcessing/LiteralPrinter.swift @@ -575,6 +575,9 @@ extension AST.MatchingOption.Kind { // NSRE Compatibility option; no literal representation case .nsreCompatibleDot: return nil + // Reverse option for lookbehinds; no literal representation + case .reverse: return nil + #if RESILIENT_LIBRARIES @unknown default: fatalError() diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index 793c6c82d..226451870 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -133,10 +133,14 @@ extension MatchingOptions { var usesCanonicalEquivalence: Bool { semanticLevel == .graphemeCluster } - + var usesNSRECompatibleDot: Bool { stack.last!.contains(.nsreCompatibleDot) } + + var reversed: Bool { + stack.last!.contains(.reverse) + } } // MARK: - Implementation @@ -160,6 +164,9 @@ extension MatchingOptions { case withoutAnchoringBounds case nsreCompatibleDot + // Not available via regex literal flags + case reverse + // Oniguruma options case asciiOnlyDigit case asciiOnlyPOSIXProps @@ -225,6 +232,8 @@ extension MatchingOptions { self = .extended case .extraExtended: self = .extraExtended + case .reverse: + self = .reverse #if RESILIENT_LIBRARIES @unknown default: fatalError() diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 
8d6a5fbc7..6b20d5e17 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -105,7 +105,7 @@ extension DSLTree { case explicit(_AST.QuantificationKind) /// A kind set via syntax, which can be affected by options. case syntax(_AST.QuantificationKind) - + var ast: AST.Quantification.Kind? { switch self { case .default: return nil @@ -114,12 +114,12 @@ extension DSLTree { } } } - + @_spi(RegexBuilder) public struct CustomCharacterClass { var members: [Member] var isInverted: Bool - + var containsDot: Bool { members.contains { member in switch member { @@ -152,13 +152,13 @@ extension DSLTree { self.members = members self.isInverted = isInverted } - + public static func generalCategory(_ category: Unicode.GeneralCategory) -> Self { let property = AST.Atom.CharacterProperty(.generalCategory(category.extendedGeneralCategory!), isInverted: false, isPOSIX: false) let astAtom = AST.Atom(.property(property), .fake) return .init(members: [.atom(.unconverted(.init(ast: astAtom)))]) } - + public var inverted: CustomCharacterClass { var result = self result.isInverted.toggle() @@ -263,7 +263,7 @@ extension DSLTree.Atom { /// \B case notWordBoundary } - + @_spi(RegexBuilder) public enum CharacterClass: Hashable { case digit @@ -396,7 +396,7 @@ extension DSLTree.Node { @_spi(RegexBuilder) public var children: [DSLTree.Node] { switch self { - + case let .orderedChoice(v): return v case let .concatenation(v): return v @@ -504,12 +504,12 @@ public struct ReferenceID: Hashable { public var _raw: Int { base } - + public init() { base = Self.counter Self.counter += 1 } - + init(_ base: Int) { self.base = base } @@ -854,14 +854,14 @@ extension DSLTree { /// `_TreeNode` conformance. struct _Tree: _TreeNode { var node: DSLTree.Node - + init(_ node: DSLTree.Node) { self.node = node } - + var children: [_Tree]? 
{ switch node { - + case let .orderedChoice(v): return v.map(_Tree.init) case let .concatenation(v): return v.map(_Tree.init) @@ -892,7 +892,7 @@ extension DSLTree { @_spi(RegexBuilder) public struct GroupKind { internal var ast: AST.Group.Kind - + public static var atomicNonCapturing: Self { .init(ast: .atomicNonCapturing) } @@ -902,17 +902,23 @@ extension DSLTree { public static var negativeLookahead: Self { .init(ast: .negativeLookahead) } + public static var lookbehind: Self { + .init(ast: .lookbehind) + } + public static var negativeLookbehind: Self { + .init(ast: .negativeLookbehind) + } } @_spi(RegexBuilder) public struct ConditionKind { internal var ast: AST.Conditional.Condition.Kind } - + @_spi(RegexBuilder) public struct QuantificationKind { internal var ast: AST.Quantification.Kind - + public static var eager: Self { .init(ast: .eager) } @@ -923,11 +929,11 @@ extension DSLTree { .init(ast: .possessive) } } - + @_spi(RegexBuilder) public struct QuantificationAmount { internal var ast: AST.Quantification.Amount - + public static var zeroOrMore: Self { .init(ast: .zeroOrMore) } @@ -965,27 +971,27 @@ extension DSLTree { } } } - + @_spi(RegexBuilder) public struct ASTNode { internal var ast: AST.Node } - + @_spi(RegexBuilder) public struct AbsentFunction { internal var ast: AST.AbsentFunction } - + @_spi(RegexBuilder) public struct Reference { internal var ast: AST.Reference } - + @_spi(RegexBuilder) public struct MatchingOptionSequence { internal var ast: AST.MatchingOptionSequence } - + public struct Atom { internal var ast: AST.Atom } diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift index 53dfe652d..84e41572a 100644 --- a/Sources/_StringProcessing/Unicode/ASCII.swift +++ b/Sources/_StringProcessing/Unicode/ASCII.swift @@ -122,6 +122,49 @@ extension String { return (first: base, next: next, crLF: false) } + /// TODO: better to take isScalarSemantics parameter, we can return more results + /// and we 
can give the right `previous` index, not requiring the caller to re-adjust it + /// TODO: detailed description of nuanced semantics + func _quickReverseASCIICharacter( + at idx: Index, + limitedBy start: Index + ) -> (char: UInt8, previous: Index, crLF: Bool)? { + // TODO: fastUTF8 version + assert(String.Index(idx, within: unicodeScalars) != nil) + assert(idx >= start) + + // Exit if we're at our limit + if idx == start { + return nil + } + + let char = utf8[idx] + guard char._isASCII else { + assert(!self[idx].isASCII) + return nil + } + + var previous = utf8.index(before: idx) + if previous == start { + return (char: char, previous: previous, crLF: false) + } + + let head = utf8[previous] + guard head._isSub300StartingByte else { return nil } + + // Handle CR-LF: + if char == ._carriageReturn && head == ._lineFeed { + utf8.formIndex(before: &previous) + guard previous == start || utf8[previous]._isSub300StartingByte else { + return nil + } + return (char: char, previous: previous, crLF: true) + } + + assert(self[idx].isASCII && self[idx] != "\r\n") + return (char: char, previous: previous, crLF: false) + } + func _quickMatch( _ cc: _CharacterClassModel.Representation, at idx: Index, @@ -169,5 +212,52 @@ extension String { } } + func _quickReverseMatch( + _ cc: _CharacterClassModel.Representation, + at idx: Index, + limitedBy start: Index, + isScalarSemantics: Bool + ) -> (previous: Index, matchResult: Bool)? 
{ + /// ASCII fast-paths + guard let (asciiValue, previous, isCRLF) = _quickReverseASCIICharacter( + at: idx, limitedBy: start + ) else { + return nil + } + + // TODO: bitvectors + switch cc { + case .any, .anyGrapheme: + return (previous, true) + + case .digit: + return (previous, asciiValue._asciiIsDigit) + + case .horizontalWhitespace: + return (previous, asciiValue._asciiIsHorizontalWhitespace) + + case .verticalWhitespace, .newlineSequence: + if asciiValue._asciiIsVerticalWhitespace { + if isScalarSemantics && isCRLF && cc == .verticalWhitespace { + return (utf8.index(after: previous), true) + } + return (previous, true) + } + return (previous, false) + + case .whitespace: + if asciiValue._asciiIsWhitespace { + if isScalarSemantics && isCRLF { + return (utf8.index(after: previous), true) + } + return (previous, true) + } + return (previous, false) + + case .word: + return (previous, asciiValue._asciiIsWord) + } + } + } diff --git a/Sources/_StringProcessing/Utility/RegexFactory.swift b/Sources/_StringProcessing/Utility/RegexFactory.swift index 0c224e159..3cce8a80b 100644 --- a/Sources/_StringProcessing/Utility/RegexFactory.swift +++ b/Sources/_StringProcessing/Utility/RegexFactory.swift @@ -167,7 +167,25 @@ public struct _RegexFactory { ) -> Regex { .init(node: .nonCapturingGroup(.negativeLookahead, component.regex.root)) } - + + @_spi(RegexBuilder) + @available(SwiftStdlib 5.7, *) + public func lookbehindNonCapturing( + _ component: some RegexComponent + ) -> Regex { + // TODO: Compiler error if component contains a custom consumer? + .init(node: .nonCapturingGroup(.lookbehind, component.regex.root)) + } + + @_spi(RegexBuilder) + @available(SwiftStdlib 5.7, *) + public func negativeLookbehindNonCapturing( + _ component: some RegexComponent + ) -> Regex { + // TODO: Compiler error if component contains a custom consumer? 
+ .init(node: .nonCapturingGroup(.negativeLookbehind, component.regex.root)) + } + @available(SwiftStdlib 5.7, *) public func orderedChoice( _ component: some RegexComponent diff --git a/Tests/DocumentationTests/RegexBuilderTests.swift b/Tests/DocumentationTests/RegexBuilderTests.swift index d0ae36e01..e9535cdf3 100644 --- a/Tests/DocumentationTests/RegexBuilderTests.swift +++ b/Tests/DocumentationTests/RegexBuilderTests.swift @@ -205,3 +205,32 @@ extension RegexBuilderTests { XCTAssertEqual(matches[0].1, 121.54) } } + +@available(SwiftStdlib 5.10, *) +extension RegexBuilderTests { + func testPositiveLookbehind() throws { + let regex = Regex { + Lookbehind { "foo" } + "bar" + } + + let matching = try regex.firstMatch(in: "foobar")?.output // == "bar" + let nonMatching = try regex.firstMatch(in: "fuubar")?.output // == nil + + try XCTAssertEqual(XCTUnwrap(matching), "bar") + XCTAssertNil(nonMatching) + } + + func testNegativeLookbehind() throws { + let regex = Regex { + NegativeLookbehind { "buzz" } + "baz" + } + + let matching = try regex.firstMatch(in: "foobaz")?.output // == "baz" + let nonMatching = try regex.firstMatch(in: "buzzbaz")?.output // == nil + + try XCTAssertEqual(XCTUnwrap(matching), "baz") + XCTAssertNil(nonMatching) + } +} diff --git a/Tests/MatchingEngineTests/MatchingEngineTests.swift b/Tests/MatchingEngineTests/MatchingEngineTests.swift deleted file mode 100644 index ccfe85ec7..000000000 --- a/Tests/MatchingEngineTests/MatchingEngineTests.swift +++ /dev/null @@ -1,17 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2021-2022 Apple Inc. 
and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -import XCTest - -@testable import _StringProcessing - -// TODO: Unit tests for the engine itself. Functional testing -// is handled by regex tests. diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 7ea38490a..6ea7da996 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -51,6 +51,13 @@ enum DecodedInstr { case transformCapture case captureValue case quantify + case reverse + case reverseMatch + case reverseMatchScalar + case reverseMatchBitset + case reverseMatchBuiltin + case reverseMatchAnyNonNewline + case reverseQuantify } extension DecodedInstr { @@ -142,6 +149,20 @@ extension DecodedInstr { return .captureValue case .matchBuiltin: return .matchBuiltin + case .reverse: + return .reverse + case .reverseMatch: + return .reverseMatch + case .reverseMatchScalar: + return .reverseMatchScalar + case .reverseMatchBitset: + return .reverseMatchBitset + case .reverseMatchBuiltin: + return .reverseMatchBuiltin + case .reverseMatchAnyNonNewline: + return .reverseMatchAnyNonNewline + case .reverseQuantify: + return .reverseQuantify case .matchUTF8: return .matchUTF8 } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index c52560d66..3c8072e92 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -289,8 +289,8 @@ func firstMatchTest( input: String, match: String?, syntax: SyntaxOptions = .traditional, - enableTracing: Bool = false, - dumpAST: Bool = false, + enableTracing: Bool = true, + dumpAST: Bool = true, xfail: Bool = false, validateOptimizations: Bool = true, semanticLevel: RegexSemanticLevel = .graphemeCluster, @@ -325,6 +325,7 @@ func firstMatchTests( 
enableTracing: Bool = false, dumpAST: Bool = false, xfail: Bool = false, + validateOptimizations: Bool = true, semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #filePath, line: UInt = #line @@ -338,6 +339,7 @@ func firstMatchTests( enableTracing: enableTracing, dumpAST: dumpAST, xfail: xfail, + validateOptimizations: validateOptimizations, semanticLevel: semanticLevel, file: file, line: line) @@ -1601,28 +1603,62 @@ extension RegexTests { (input: "hzello", match: "e"), (input: "hezllo", match: nil), (input: "helloz", match: nil)) + } + func testLookbehinds() { firstMatchTest( - #"(?<=USD)\d+"#, input: "Price: USD100", match: "100", xfail: true) + #"(?<=USD)\d+"#, input: "Price: USD100", match: "100") firstMatchTest( - #"(*plb:USD)\d+"#, input: "Price: USD100", match: "100", xfail: true) + #"(*plb:USD)\d+"#, input: "Price: USD100", match: "100") firstMatchTest( #"(*positive_lookbehind:USD)\d+"#, - input: "Price: USD100", match: "100", xfail: true) - // engines generally enforce that lookbehinds are fixed width + input: "Price: USD100", match: "100") + firstMatchTest( - #"\d{3}(?<=USD\d{3})"#, input: "Price: USD100", match: "100", xfail: true) + #"\d{3}(?<=USD\d{3})"#, input: "Price: USD100", match: "100") firstMatchTest( - #"(? 
Date: Thu, 26 Dec 2024 18:15:39 -0600 Subject: [PATCH 2/8] Squash some bugs --- .../_StringProcessing/Engine/MEBuiltins.swift | 6 +- .../_StringProcessing/Engine/Processor.swift | 4 +- Sources/_StringProcessing/Unicode/ASCII.swift | 18 +- Tests/MatchingEngineTests/ASCIITests.swift | 153 +++++++++++ .../MatchingEngineTests.swift | 251 ++++++++++++++++++ 5 files changed, 418 insertions(+), 14 deletions(-) create mode 100644 Tests/MatchingEngineTests/ASCIITests.swift create mode 100644 Tests/MatchingEngineTests/MatchingEngineTests.swift diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 5ac33fcdf..d6d77e749 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -252,7 +252,7 @@ extension String { limitedBy start: String.Index, isScalarSemantics: Bool ) -> String.Index? { - guard currentPosition >= start else { return nil } + guard currentPosition > start else { return nil } if case .definite(let result) = _quickReverseMatchAnyNonNewline( at: currentPosition, limitedBy: start, @@ -297,7 +297,7 @@ extension String { limitedBy start: String.Index, isScalarSemantics: Bool ) -> QuickResult { - assert(currentPosition >= start) + assert(currentPosition > start) guard let (asciiValue, previous, isCRLF) = _quickReverseASCIICharacter( at: currentPosition, limitedBy: start ) else { @@ -338,7 +338,7 @@ extension String { isScalarSemantics: Bool ) -> String.Index? 
{ if isScalarSemantics { - guard currentPosition >= start else { return nil } + guard currentPosition > start else { return nil } let scalar = unicodeScalars[currentPosition] guard !scalar.isNewline else { return nil } return unicodeScalars.index(before: currentPosition) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index e6a93280d..e5c2b54c6 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -221,7 +221,7 @@ extension Processor { // Reverse in our input // - // Returns whether the advance succeeded. On failure, our + // Returns whether the reverse succeeded. On failure, our // save point was restored mutating func reverseConsume(_ n: Distance) -> Bool { // TODO: needs benchmark coverage @@ -234,7 +234,7 @@ extension Processor { // If `start` falls in the middle of a character, and we are trying to advance // by one "character", then we should max out at `start` even though the above - // advancement will result in `nil`. + // reversal will result in `nil`. 
if n == 1, let idx = input.unicodeScalars.index( currentPosition, offsetBy: -n.rawValue, limitedBy: start ) { diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift index 84e41572a..26cbff4c3 100644 --- a/Sources/_StringProcessing/Unicode/ASCII.swift +++ b/Sources/_StringProcessing/Unicode/ASCII.swift @@ -109,7 +109,7 @@ extension String { let tail = utf8[next] guard tail._isSub300StartingByte else { return nil } - // Handle CR-LF: + // Handle CR-LF by advancing past the sequence if both characters are present if base == ._carriageReturn && tail == ._lineFeed { utf8.formIndex(after: &next) guard next == end || utf8[next]._isSub300StartingByte else { @@ -123,17 +123,17 @@ extension String { } /// TODO: better to take isScalarSemantics parameter, we can return more results - /// and we can give the right `previous` index, not requiring the caller to re-adjust it + /// and we can give the right `next` index, not requiring the caller to re-adjust it /// TODO: detailed description of nuanced semantics func _quickReverseASCIICharacter( at idx: Index, limitedBy start: Index - ) -> (char: UInt8, previous: Index, crLF: Bool)? { + ) -> (first: UInt8, previous: Index, crLF: Bool)? 
{ // TODO: fastUTF8 version assert(String.Index(idx, within: unicodeScalars) != nil) assert(idx >= start) - // Exit if we're at our limit + // If we're already at the start, there is no previous character if idx == start { return nil } @@ -146,23 +146,23 @@ extension String { var previous = utf8.index(before: idx) if previous == start { - return (char: char, previous: previous, crLF: false) + return (first: char, previous: previous, crLF: false) } let head = utf8[previous] guard head._isSub300StartingByte else { return nil } - // Handle CR-LF: - if char == ._carriageReturn && head == ._lineFeed { + // Handle CR-LF by reversing past the sequence if both characters are present + if char == ._lineFeed && head == ._carriageReturn { utf8.formIndex(before: &previous) guard previous == start || utf8[previous]._isSub300StartingByte else { return nil } - return (char: char, previous: previous, crLF: true) + return (first: char, previous: previous, crLF: true) } assert(self[idx].isASCII && self[idx] != "\r\n") - return (char: char, previous: previous, crLF: false) + return (first: char, previous: previous, crLF: false) } func _quickMatch( diff --git a/Tests/MatchingEngineTests/ASCIITests.swift b/Tests/MatchingEngineTests/ASCIITests.swift new file mode 100644 index 000000000..3854a4c5d --- /dev/null +++ b/Tests/MatchingEngineTests/ASCIITests.swift @@ -0,0 +1,153 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. 
and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import XCTest + +@testable import _StringProcessing + +final class QuickASCIICharacterTests: XCTestCase { + func testHappyPath() throws { + // Given + let sut = "foo" + + // When + let result = sut._quickASCIICharacter(at: sut.startIndex, limitedBy: sut.endIndex) + + // Then + let (char, nextIdx, isCRLF) = try XCTUnwrap(result) + XCTAssertEqual(char, sut.utf8[sut.startIndex]) + XCTAssertEqual(nextIdx, sut.index(after: sut.startIndex)) + XCTAssertFalse(isCRLF) + } + + func testAtEnd() throws { + // Given + let sut = "foo" + + // When + let result = sut._quickASCIICharacter(at: sut.endIndex, limitedBy: sut.endIndex) + + // Then + XCTAssertNil(result) + } + + func testNonASCIIChar() throws { + // Given + let sut = "é" + + // When + let result = sut._quickASCIICharacter(at: sut.startIndex, limitedBy: sut.endIndex) + + // Then + XCTAssertNil(result) + } + + func testNextIsEnd() throws { + // Given + let sut = "foo" + let index = sut.index(before: sut.endIndex) + + // When + let result = sut._quickASCIICharacter(at: index, limitedBy: sut.endIndex) + + // Then + let (char, nextIdx, isCRLF) = try XCTUnwrap(result) + XCTAssertEqual(char, sut.utf8[index]) + XCTAssertEqual(nextIdx, sut.endIndex) + XCTAssertFalse(isCRLF) + } + + // TODO: JH - Figure out how to test sub 300 starting bytes + func testIsCRLF() throws { + // Given + let sut = "\r\n" + + // When + let result = sut._quickASCIICharacter(at: sut.utf8.startIndex, limitedBy: sut.endIndex) + + // Then + let (char, nextIdx, isCRLF) = try XCTUnwrap(result) + XCTAssertEqual(char, sut.utf8[sut.startIndex]) + XCTAssertEqual(nextIdx, sut.endIndex) + XCTAssertTrue(isCRLF) + } +} + +final class QuickReverseASCIICharacterTests: XCTestCase { + func testHappyPath() throws { + // 
Given + let sut = "foo" + let index = sut.index(after: sut.startIndex) + + // When + let result = sut._quickReverseASCIICharacter(at: index, limitedBy: sut.startIndex) + + // Then + let (char, previousIdx, isCRLF) = try XCTUnwrap(result) + XCTAssertEqual(char, sut.utf8[index]) + XCTAssertEqual(previousIdx, sut.startIndex) + XCTAssertFalse(isCRLF) + } + + func testAtStart() throws { + // Given + let sut = "foo" + + // When + let result = sut._quickReverseASCIICharacter(at: sut.startIndex, limitedBy: sut.startIndex) + + // Then + XCTAssertNil(result) + } + + func testNonASCIIChar() throws { + // Given + let sut = "é" + + // When + let result = sut._quickReverseASCIICharacter(at: sut.startIndex, limitedBy: sut.startIndex) + + // Then + XCTAssertNil(result) + } + + func testPreviousIsStart() throws { + // Given + let sut = "foo" + let index = sut.index(after: sut.startIndex) + + // When + let result = sut._quickReverseASCIICharacter(at: index, limitedBy: sut.startIndex) + + // Then + let (char, previousIdx, isCRLF) = try XCTUnwrap(result) + XCTAssertEqual(char, sut.utf8[index]) + XCTAssertEqual(previousIdx, sut.startIndex) + XCTAssertFalse(isCRLF) + } + + // TODO: JH - Figure out how to test sub 300 starting bytes + func testIsCRLF() throws { + // Given + let sut = "foo\r\n" + // Start at '\n' + let index = sut.utf8.index(before: sut.endIndex) + + // When + let result = sut._quickReverseASCIICharacter(at: index, limitedBy: sut.startIndex) + + // Then + let (char, previousIndex, isCRLF) = try XCTUnwrap(result) + XCTAssertEqual(char, sut.utf8[index]) + XCTAssertEqual(previousIndex, sut.index(sut.startIndex, offsetBy: 2)) + XCTAssertTrue(isCRLF) + } +} diff --git a/Tests/MatchingEngineTests/MatchingEngineTests.swift b/Tests/MatchingEngineTests/MatchingEngineTests.swift new file mode 100644 index 000000000..946eec3cd --- /dev/null +++ b/Tests/MatchingEngineTests/MatchingEngineTests.swift @@ -0,0 +1,251 @@ 
+//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import XCTest + +@testable import _StringProcessing + +final class StringMatchingTests: XCTestCase { + // MARK: characterAndEnd tests + func testCharacterAndEnd_HappyPath() throws { + // Given + let sut = "foo" + + // When + let result = sut.characterAndEnd(at: sut.startIndex, limitedBy: sut.endIndex) + + // Then + let (char, nextIndex) = try XCTUnwrap(result) + XCTAssertEqual(char, "f") + XCTAssertEqual(nextIndex, sut.index(after: sut.startIndex)) + } + + func testCharacterAndEnd_SubcharacterMatch() throws { + // Given a string with 2 subcharacter positions in its utf8 view + // \u{62}\u{300}\u{316}\u{65}\u{73}\u{74} + let sut = "b̖̀est" + + let pos = sut.startIndex + let end = sut.utf8.index(after: sut.utf8.startIndex) + + // When + let result = sut.characterAndEnd(at: pos, limitedBy: end) + + // Then + let (char, nextIndex) = try XCTUnwrap(result) + XCTAssertEqual(char, "b") + XCTAssertEqual(nextIndex, end) + } + + func testCharacterAndEnd_SubcharacterMatchEmptyRounded() throws { + // Given a string with 3 sub-character positions in its utf8 view + // \u{62}\u{300}\u{316}\u{335}\u{65}\u{73}\u{74} + let sut = "b̵̖̀est" + + // And a range that doesn't touch a grapheme cluster boundary + // 1[utf8] (aka \u{300}) + let pos = sut.utf8.index(after: sut.startIndex) + // 2[utf8] (aka \u{316}) + let end = sut.utf8.index(sut.startIndex, offsetBy: 2) + + // When we try to get a character from a sub-character range + // of unicode scalars + let result = sut.characterAndEnd(at: pos, limitedBy: end) + + // Then `characterAndEnd` 
should return nil rather than an empty string + XCTAssertNil(result) + } + + func testCharacterAndEnd_atEnd() { + // Given + let sut = "foo" + + // When + let result = sut.characterAndEnd(at: sut.endIndex, limitedBy: sut.endIndex) + + // Then + XCTAssertNil(result) + } + + // MARK: characterAndStart tests + func testCharacterAndStart_HappyPath() throws { + // Given + let sut = "foo" + let pos = sut.index(before: sut.endIndex) + + // When + let result = sut.characterAndStart(at: pos, limitedBy: sut.startIndex) + + // Then + let (char, previousIndex) = try XCTUnwrap(result) + XCTAssertEqual(char, "o") + XCTAssertEqual(previousIndex, sut.index(before: pos)) + } + + // FIXME: JH - Two diacritical marks are considered a character. + // TODO: JH - Learn more about Substring rounding(?) +// func testCharacterAndStart_SubcharacterMatch() throws { +// // Given a string with 2 subcharacter positions in its utf8 view +// // \u{61}\u{62}\u{300}\u{316}\u{63}\u{64} +// let sut = "ab̖̀cd" +// +// // 3[utf8] (aka \u{316}) +// let pos = sut.utf8.index(sut.startIndex, offsetBy: 3) +// let start = sut.startIndex//utf8.index(before: pos) +// +// // When +// let result = sut.characterAndStart(at: pos, limitedBy: start) +// +// // Then +// XCTAssertNil(result) +// let (char, nextIndex) = try XCTUnwrap(result) +// XCTAssertEqual(char, "t") +// XCTAssertEqual(nextIndex, end) +// } +// +// func testCharacterAndStart_SubcharacterMatchEmptyRounded() throws { +// // Given a string with 3 sub-character positions in its utf8 view +// // \u{61}\u{62}\u{335}\u{300}\u{316}\u{63}\u{64} +// let sut = "ab̵̖̀cd" +// +// // And a range that doesn't touch a grapheme cluster boundary +// // 4[utf8] (aka \u{335}) +// let pos = sut.utf8.index(sut.startIndex, offsetBy: 4) +// // 3[utf8] (aka \u{300}) +// let start = sut.utf8.index(sut.startIndex, offsetBy: 3) +// +// // When we try to get a character from a sub-character range +// // of unicode scalars +// let result = sut.characterAndStart(at: pos, 
limitedBy: start) +// +// // Then `characterAndStart` should return nil rather than an empty string +// XCTAssertNil(result) +// } + + func testCharacterAndStart_atStart() { + // Given + let sut = "foo" + + // When + let result = sut.characterAndStart(at: sut.startIndex, limitedBy: sut.startIndex) + + // Then + XCTAssertNil(result) + } + + // MARK: matchAnyNonNewline tests + func testMatchAnyNonNewline() throws { + // Given + // A string without any newline characters + let sut = "bar" + // and any index other than `endIndex` + let pos = sut.index(before: sut.endIndex) + + // When we run the match: + let result = sut.matchAnyNonNewline( + at: pos, + limitedBy: sut.endIndex, + isScalarSemantics: true + ) + + // Then the next index should be `sut.endIndex` + let nextIndex = try XCTUnwrap(result) + XCTAssertEqual(nextIndex, sut.endIndex) + } + + func testMatchAnyNonNewline_Newline() throws { + // Given + // A string that has a newline character + let sut = "ba\nr" + // and the index of that newline character + let pos = try XCTUnwrap(sut.firstIndex(of: "\n")) + + // When we run the match: + let result = sut.matchAnyNonNewline( + at: pos, + limitedBy: sut.endIndex, + isScalarSemantics: true + ) + + // Then we should get nil because the character at `pos` is a newline + XCTAssertNil(result) + } + + func testMatchAnyNonNewline_atEnd() throws { + // Given + // A string without any newline characters + let sut = "bar" + + // When we try to match starting at `endIndex`: + let result = sut.matchAnyNonNewline( + at: sut.endIndex, + limitedBy: sut.endIndex, + isScalarSemantics: true + ) + + // Then we should get nil because there isn't a character at `endIndex` + XCTAssertNil(result) + } + + func testReverseMatchAnyNonNewline() throws { + // Given + // A string without any newline characters + let sut = "bar" + // and an index other than `startIndex` or `endIndex` + let pos = sut.index(before: sut.endIndex) + + // When we run the reverse match: + let
result = sut.reverseMatchAnyNonNewline( + at: pos, + limitedBy: sut.startIndex, + isScalarSemantics: true + ) + + // Then we should get a previous index + let previousIndex = try XCTUnwrap(result) + // The character at the previous index should be "a" + XCTAssertEqual(sut[previousIndex], "a") + } + + func testReverseMatchAnyNonNewline_Newline() throws { + // Given + // A string that has a newline character, + let sut = "ba\nr" + // and the index of that newline character + let pos = try XCTUnwrap(sut.firstIndex(of: "\n")) + + // When we run the reverse match: + let result = sut.reverseMatchAnyNonNewline( + at: pos, + limitedBy: sut.startIndex, + isScalarSemantics: true + ) + + // Then we should get nil because the character at `pos` is a newline + XCTAssertNil(result) + } + + func testReverseMatchAnyNonNewline_atStart() throws { + // Given + // A string without any newline characters + let sut = "bar" + + // When we try to reverse match starting at `startIndex`: + let result = sut.reverseMatchAnyNonNewline( + at: sut.startIndex, + limitedBy: sut.startIndex, + isScalarSemantics: true + ) + + // Then we should get nil because there isn't an index before `startIndex` + XCTAssertNil(result) + } +} From 2d3c6912a894e3df53aa915f59e191d8825e5204 Mon Sep 17 00:00:00 2001 From: Jacob Hearst Date: Wed, 1 Jan 2025 11:45:15 -0600 Subject: [PATCH 3/8] Add ASCII _quickMatch and _quickReverseMatch tests --- Sources/_StringProcessing/Unicode/ASCII.swift | 2 - Tests/MatchingEngineTests/ASCIITests.swift | 206 ++++++++++++++++++ 2 files changed, 206 insertions(+), 2 deletions(-) diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift index 26cbff4c3..efbe406e3 100644 --- a/Sources/_StringProcessing/Unicode/ASCII.swift +++ b/Sources/_StringProcessing/Unicode/ASCII.swift @@ -258,6 +258,4 @@ extension String { return (previous, asciiValue._asciiIsWord) } } - } - diff --git a/Tests/MatchingEngineTests/ASCIITests.swift 
b/Tests/MatchingEngineTests/ASCIITests.swift index 3854a4c5d..4af6bf28f 100644 --- a/Tests/MatchingEngineTests/ASCIITests.swift +++ b/Tests/MatchingEngineTests/ASCIITests.swift @@ -151,3 +151,209 @@ final class QuickReverseASCIICharacterTests: XCTestCase { XCTAssertTrue(isCRLF) } } + +final class ASCIIQuickMatchTests: XCTestCase { + func testAny() throws { + try _test(matching: .any, against: "!") + try _test(matching: .anyGrapheme, against: "!") + } + + func testDigit() throws { + try _test(matching: .digit, against: "1") + try _test(matching: .digit, against: "a", shouldMatch: false) + } + + func testHorizontalWhitespace() throws { + try _test(matching: .horizontalWhitespace, against: " ") + try _test(matching: .horizontalWhitespace, against: "\t") + try _test(matching: .horizontalWhitespace, against: "\n", shouldMatch: false) + } + + func testVerticalWhitespace() throws { + try _test(matching: .verticalWhitespace, against: "\n") + try _test(matching: .verticalWhitespace, against: "\t", shouldMatch: false) + try _test(matching: .newlineSequence, against: "\n") + try _test(matching: .newlineSequence, against: "\t", shouldMatch: false) + } + + func testVerticalWhitespaceMatchesCRLF() throws { + let crlf = "\r\n" + + // When using scalar semantics: + // The next index should be the index of the "\n" character + try _test( + matching: .verticalWhitespace, + against: crlf, + expectedNext: crlf.utf8.firstIndex(of: ._lineFeed) + ) + + // When not using scalar semantics: + // The next index should be the index after the whole \r\n sequence (the end index) + try _test( + matching: .verticalWhitespace, + against: crlf, + isScalarSemantics: false + ) + } + + func testWhitespace() throws { + try _test(matching: .whitespace, against: " ") + try _test(matching: .whitespace, against: "\t") + try _test(matching: .whitespace, against: "\n") + try _test(matching: .whitespace, against: "a", shouldMatch: false) + } + + func testWhitespaceCRLF() throws { + // Given + let crlf = 
"\r\n" + + // When using scalar semantics: + // The next index should be the index of the "\n" character + try _test( + matching: .whitespace, + against: crlf, + expectedNext: crlf.utf8.firstIndex(of: ._lineFeed) + ) + + // When not using scalar semantics: + // The next index should be the index after the whole \r\n sequence (the end index) + try _test( + matching: .whitespace, + against: crlf, + isScalarSemantics: false + ) + } + + func testWord() throws { + // Given + try _test(matching: .word, against: "a") + try _test(matching: .word, against: "1") + try _test(matching: .word, against: "_") + try _test(matching: .word, against: "-", shouldMatch: false) + } + + private func _test( + matching cc: _CharacterClassModel.Representation, + against sut: String, + isScalarSemantics: Bool = true, + shouldMatch: Bool = true, + expectedNext: String.Index? = nil + ) throws { + // When + let result = sut._quickMatch( + cc, + at: sut.startIndex, + limitedBy: sut.endIndex, + isScalarSemantics: isScalarSemantics + ) + + // Then + let (next, matched) = try XCTUnwrap(result) + XCTAssertEqual(matched, shouldMatch) + XCTAssertEqual(next, expectedNext ?? 
sut.endIndex) + } +} + +final class ASCIIQuickReverseMatchTests: XCTestCase { + func testAny() throws { + try _test(matching: .any, against: "1!") + try _test(matching: .anyGrapheme, against: "1!") + } + + func testDigit() throws { + try _test(matching: .digit, against: "a1") + try _test(matching: .digit, against: "1a", shouldMatch: false) + } + + func testHorizontalWhitespace() throws { + try _test(matching: .horizontalWhitespace, against: "a ") + try _test(matching: .horizontalWhitespace, against: "a\t") + try _test(matching: .horizontalWhitespace, against: "a\n", shouldMatch: false) + } + + func testVerticalWhitespace() throws { + try _test(matching: .verticalWhitespace, against: "a\n") + try _test(matching: .verticalWhitespace, against: "a\t", shouldMatch: false) + } + + func testVerticalWhitespaceMatchesCRLF() throws { + let sut = "a\r\n" + + // When using scalar semantics: + // The previous index should be the index of the "\r" character + try _test( + matching: .verticalWhitespace, + against: sut, + at: sut.utf8.index(before: sut.utf8.endIndex), + expectedPrevious: sut.utf8.firstIndex(of: ._carriageReturn) + ) + + // When not using scalar semantics: + // The previous index should be the index before the whole \r\n sequence (the start index) + try _test( + matching: .verticalWhitespace, + against: sut, + isScalarSemantics: false + ) + } + + func testWhitespace() throws { + try _test(matching: .whitespace, against: "a ") + try _test(matching: .whitespace, against: "a\t") + try _test(matching: .whitespace, against: "a\n") + try _test(matching: .whitespace, against: " a", shouldMatch: false) + } + + func testWhitespaceCRLF() throws { + // Given + let sut = "a\r\n" + + // When using scalar semantics: + // The previous index should be the index of the "\r" character + try _test( + matching: .whitespace, + against: sut, + at: sut.utf8.index(before: sut.utf8.endIndex), + expectedPrevious: sut.utf8.firstIndex(of: ._carriageReturn) + ) + + // When not using scalar semantics: +
// The previous index should be the index before the whole \r\n sequence + // (the start index) + try _test( + matching: .whitespace, + against: sut, + isScalarSemantics: false + ) + } + + func testWord() throws { + // Given + try _test(matching: .word, against: "!a") + try _test(matching: .word, against: "!1") + try _test(matching: .word, against: "!_") + try _test(matching: .word, against: "a-", shouldMatch: false) + } + + private func _test( + matching cc: _CharacterClassModel.Representation, + against sut: String, + at index: String.Index? = nil, + isScalarSemantics: Bool = true, + shouldMatch: Bool = true, + expectedPrevious: String.Index? = nil + ) throws { + // When + let result = sut._quickReverseMatch( + cc, + at: index ?? sut.index(before: sut.endIndex), + limitedBy: sut.startIndex, + isScalarSemantics: isScalarSemantics + ) + + // Then + let (previous, matched) = try XCTUnwrap(result) + XCTAssertEqual(matched, shouldMatch) + XCTAssertEqual(previous, expectedPrevious ?? sut.startIndex) + } +} From f28e9fa9fe939c48046688e0e265369e34e9795e Mon Sep 17 00:00:00 2001 From: Jacob Hearst Date: Mon, 20 Jan 2025 11:45:00 -0600 Subject: [PATCH 4/8] Unit test matchScalar and reversMatchScalar --- .../_StringProcessing/Engine/MEBuiltins.swift | 8 +- .../_StringProcessing/Engine/Processor.swift | 5 +- .../MatchingEngineTests.swift | 268 ++++++++++++++++++ Tests/RegexTests/MatchTests.swift | 11 +- 4 files changed, 280 insertions(+), 12 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index d6d77e749..691de6ef7 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -416,7 +416,7 @@ extension String { isStrictASCII: Bool, isScalarSemantics: Bool ) -> String.Index? 
{ - guard currentPosition >= start else { return nil } + guard currentPosition > start else { return nil } if case .definite(let result) = _quickReverseMatchBuiltinCC( cc, at: currentPosition, @@ -443,6 +443,7 @@ extension String { isScalarSemantics: isScalarSemantics) } + // TODO: JH - Is there any value in testing this? How would it be tested? // Mentioned in ProgrammersManual.md, update docs if redesigned @inline(__always) private func _quickMatchBuiltinCC( @@ -450,7 +451,7 @@ extension String { at currentPosition: String.Index, limitedBy end: String.Index, isInverted: Bool, - isStrictASCII: Bool, + isStrictASCII: Bool, // TODO: JH - Is this just reserved for future use? A relic of the past? isScalarSemantics: Bool ) -> QuickResult { assert(currentPosition < end) @@ -474,7 +475,7 @@ extension String { isStrictASCII: Bool, isScalarSemantics: Bool ) -> QuickResult { - assert(currentPosition >= start) + assert(currentPosition > start) guard let (previous, result) = _quickReverseMatch( cc, at: currentPosition, @@ -486,6 +487,7 @@ extension String { return .definite(result == isInverted ? nil : previous) } + // TODO: JH - How can this be unit tested? // Mentioned in ProgrammersManual.md, update docs if redesigned @inline(never) private func _thoroughMatchBuiltinCC( diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index e5c2b54c6..b76b55b7d 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -232,7 +232,7 @@ extension Processor { return true } - // If `start` falls in the middle of a character, and we are trying to advance + // If `start` falls in the middle of a character, and we are trying to reverse // by one "character", then we should max out at `start` even though the above // reversal will result in `nil`. if n == 1, let idx = input.unicodeScalars.index( @@ -994,7 +994,7 @@ extension String { ) -> Index? 
{ // TODO: extremely quick-check-able // TODO: can be sped up with string internals - guard pos >= start else { return nil } + guard pos > start else { return nil } let curScalar = unicodeScalars[pos] if isCaseInsensitive { @@ -1006,7 +1006,6 @@ extension String { guard curScalar == scalar else { return nil } } - guard pos != start else { return pos } let idx = unicodeScalars.index(before: pos) assert(idx >= start, "Input is a substring with a sub-scalar startIndex.") diff --git a/Tests/MatchingEngineTests/MatchingEngineTests.swift b/Tests/MatchingEngineTests/MatchingEngineTests.swift index 946eec3cd..ee11f613f 100644 --- a/Tests/MatchingEngineTests/MatchingEngineTests.swift +++ b/Tests/MatchingEngineTests/MatchingEngineTests.swift @@ -248,4 +248,272 @@ final class StringMatchingTests: XCTestCase { // Then we should get nil because there isn't an index before `startIndex` XCTAssertNil(result) } + + func testMatchBuiltinCCAtEnd() { + // Given + let sut = "" + + // When + let next = sut.matchBuiltinCC( + .any, + at: sut.endIndex, + limitedBy: sut.endIndex, + isInverted: false, + isStrictASCII: false, + isScalarSemantics: true + ) + + // Then + XCTAssertNil(next) + } +} + +// MARK: matchScalar tests +extension StringMatchingTests { + func testMatchScalar() { + // Given + let sut = "bar" + + // When + let next = sut.matchScalar( + "b", + at: sut.startIndex, + limitedBy: sut.endIndex, + boundaryCheck: false, + isCaseInsensitive: false + ) + + // Then + XCTAssertEqual(next, sut.index(after: sut.startIndex)) + } + + func testMatchScalarNoMatch() { + // Given + let sut = "bar" + + // When + let next = sut.matchScalar( + "a", + at: sut.startIndex, + limitedBy: sut.endIndex, + boundaryCheck: false, + isCaseInsensitive: false + ) + + // Then + XCTAssertNil(next) + } + + func testMatchScalarCaseInsensitive() { + // Given + let sut = "BAR" + + // When + let next = sut.matchScalar( + "b", + at: sut.startIndex, + limitedBy: sut.endIndex, + boundaryCheck: false, + 
isCaseInsensitive: true + ) + + // Then + XCTAssertEqual(next, sut.index(after: sut.startIndex)) + } + + func testMatchScalarCaseInsensitiveNoMatch() { + // Given + let sut = "BAR" + + // When + let next = sut.matchScalar( + "a", + at: sut.startIndex, + limitedBy: sut.endIndex, + boundaryCheck: false, + isCaseInsensitive: true + ) + + // Then + XCTAssertNil(next) + } + + func testMatchScalarAtEnd() { + // Given + let sut = "" + + // When + let next = sut.matchScalar( + "a", + at: sut.endIndex, + limitedBy: sut.endIndex, + boundaryCheck: false, + isCaseInsensitive: false + ) + + // Then + XCTAssertNil(next) + } + + func testMatchScalarBoundaryCheck() { + // Given + // \u{62}\u{300}\u{316}\u{65}\u{73}\u{74} + let sut = "b̖̀est" + + // When + let next = sut.matchScalar( + "\u{300}", + at: sut.unicodeScalars.index(after: sut.unicodeScalars.startIndex), + limitedBy: sut.endIndex, + boundaryCheck: true, + isCaseInsensitive: false + ) + + // Then + XCTAssertNil(next) + } + + func testMatchScalarNoBoundaryCheck() { + // Given + // \u{62}\u{300}\u{316}\u{65}\u{73}\u{74} + let sut = "b̖̀est" + let atPos = sut.unicodeScalars.index(after: sut.unicodeScalars.startIndex) + + // When + let next = sut.matchScalar( + "\u{300}", + at: atPos, + limitedBy: sut.endIndex, + boundaryCheck: false, + isCaseInsensitive: false + ) + + // Then + XCTAssertEqual(next, sut.unicodeScalars.index(after: atPos)) + } +} + +// MARK: reverseMatchScalar tests +extension StringMatchingTests { + func testReverseMatchScalar() { + // Given + let sut = "bar" + + // When + let previous = sut.reverseMatchScalar( + "a", + at: sut.index(after: sut.startIndex), + limitedBy: sut.startIndex, + boundaryCheck: false, + isCaseInsensitive: false + ) + + // Then + XCTAssertEqual(previous, sut.startIndex) + } + + func testReverseMatchScalarNoMatch() { + // Given + let sut = "bar" + + // When + let previous = sut.reverseMatchScalar( + "b", + at: sut.index(after: sut.startIndex), + limitedBy: sut.startIndex, + 
boundaryCheck: false, + isCaseInsensitive: false + ) + + // Then + XCTAssertNil(previous) + } + + func testReverseMatchScalarCaseInsensitive() { + // Given + let sut = "BAR" + + // When + let previous = sut.reverseMatchScalar( + "a", + at: sut.index(after: sut.startIndex), + limitedBy: sut.startIndex, + boundaryCheck: false, + isCaseInsensitive: true + ) + + // Then + XCTAssertEqual(previous, sut.startIndex) + } + + func testReverseMatchScalarCaseInsensitiveNoMatch() { + // Given + let sut = "BAR" + + // When + let previous = sut.reverseMatchScalar( + "b", + at: sut.index(after: sut.startIndex), + limitedBy: sut.startIndex, + boundaryCheck: false, + isCaseInsensitive: true + ) + + // Then + XCTAssertNil(previous) + } + + func testReverseMatchScalarAtStart() { + // Given + let sut = "a" + + // When + let previous = sut.reverseMatchScalar( + "a", + at: sut.startIndex, + limitedBy: sut.startIndex, + boundaryCheck: false, + isCaseInsensitive: false + ) + + // Then + XCTAssertNil(previous) + } + + func testReverseMatchScalarBoundaryCheck() { + // Given + // \u{61}\u{62}\u{300}\u{316}\u{63}\u{64} + let sut = "ab̖̀cd" + + // When + let previous = sut.reverseMatchScalar( + "\u{316}", + at: sut.unicodeScalars.index(sut.unicodeScalars.startIndex, offsetBy: 3), + limitedBy: sut.startIndex, + boundaryCheck: true, + isCaseInsensitive: false + ) + + // Then + XCTAssertNil(previous) + } + + func testReverseMatchScalarNoBoundaryCheck() { + // Given + // \u{61}\u{62}\u{300}\u{316}\u{63}\u{64} + let sut = "ab̖̀cd" + let atPos = sut.unicodeScalars.index(sut.unicodeScalars.startIndex, offsetBy: 3) + + // When + let previous = sut.reverseMatchScalar( + "\u{316}", + at: atPos, + limitedBy: sut.startIndex, + boundaryCheck: false, + isCaseInsensitive: false + ) + + // Then + XCTAssertEqual(previous, sut.unicodeScalars.index(before: atPos)) + } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 3c8072e92..ae739dd7f 100644 --- 
a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1637,12 +1637,11 @@ extension RegexTests { ("123defg", nil) ) -// FIXME: quickMatch and thoroughMatch have different results -// firstMatchTest( -// #"(?<=\d{1,3}-.{1,3}-\d{1,3})suffix"#, -// input: "123-_+/-789suffix", -// match: "suffix" -// ) + firstMatchTest( + #"(?<=\d{1,3}-.{1,3}-\d{1,3})suffix"#, + input: "123-_+/-789suffix", + match: "suffix" + ) firstMatchTests( #"(?<=^\d{1,3})abc"#, From 41f331dfb58c169c3f9e0651e69eedf081ab97a0 Mon Sep 17 00:00:00 2001 From: Jacob Hearst Date: Tue, 28 Jan 2025 17:48:10 -0600 Subject: [PATCH 5/8] Add reverseMatchUTF8 --- Sources/_StringProcessing/ByteCodeGen.swift | 6 +- .../Engine/Instruction.swift | 12 ++ .../_StringProcessing/Engine/MEBuilder.swift | 5 +- .../_StringProcessing/Engine/Processor.swift | 50 ++++- .../MatchingEngineTests.swift | 204 +++++++++++++++++- Tests/RegexTests/CompileTests.swift | 7 +- 6 files changed, 270 insertions(+), 14 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 885573662..7569f7489 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -139,7 +139,11 @@ fileprivate extension Compiler.ByteCodeGen { // ASCII value) if s.utf8.count >= longThreshold, !options.isCaseInsensitive { let boundaryCheck = options.semanticLevel == .graphemeCluster - builder.buildMatchUTF8(Array(s.utf8), boundaryCheck: boundaryCheck) + builder.buildMatchUTF8( + Array(s.utf8), + boundaryCheck: boundaryCheck, + reverse: reverse + ) return } } diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index d3a3d5fad..9adc23da7 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -134,6 +134,7 @@ extension Instruction { /// /// Operands: Scalar value to match against and booleans case 
reverseMatchScalar + /// Match directly (binary semantics) against a series of UTF-8 bytes /// /// NOTE: Compiler should ensure to only emit this instruction when normalization @@ -145,6 +146,17 @@ extension Instruction { /// matchUTF8(_: UTF8Register, boundaryCheck: Bool) case matchUTF8 + /// Reverse match directly (binary semantics) against a series of UTF-8 bytes + /// + /// NOTE: Compiler should ensure to only emit this instruction when normalization + /// is not required. E.g., scalar-semantic mode or when the matched portion is entirely ASCII + /// (which is invariant under NFC). Similarly, this is case-sensitive. + /// + /// TODO: should we add case-insensitive? + /// + /// reverseMatchUTF8(_: UTF8Register, boundaryCheck: Bool) + case reverseMatchUTF8 + /// Match a character or a scalar against a set of valid ascii values stored in a bitset /// /// matchBitset(_: AsciiBitsetRegister, isScalar: Bool) diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 5efad688a..07a685007 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -209,8 +209,9 @@ extension MEProgram.Builder { opcode, .init(element: elements.store(e), isCaseInsensitive: isCaseInsensitive))) } - mutating func buildMatchUTF8(_ utf8: Array, boundaryCheck: Bool) { - instructions.append(.init(.matchUTF8, .init( + mutating func buildMatchUTF8(_ utf8: Array, boundaryCheck: Bool, reverse: Bool) { + let opcode = reverse ? 
Instruction.OpCode.reverseMatchUTF8 : .matchUTF8 + instructions.append(.init(opcode, .init( utf8: utf8Contents.store(utf8), boundaryCheck: boundaryCheck))) } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index b76b55b7d..6a5244793 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -429,6 +429,24 @@ extension Processor { return true } + // TODO: bytes should be a Span or RawSpan + mutating func reverseMatchUTF8( + _ bytes: Array, + boundaryCheck: Bool + ) -> Bool { + guard let previous = input.reverseMatchUTF8( + bytes, + at: currentPosition, + limitedBy: start, + boundaryCheck: boundaryCheck + ) else { + signalFailure() + return false + } + currentPosition = previous + return true + } + // If we have a bitset we know that the CharacterClass only matches against // ascii characters, so check if the current input element is ascii then // check if it is set in the bitset @@ -721,6 +739,15 @@ extension Processor { controller.step() } + case .reverseMatchUTF8: + let (utf8Reg, boundaryCheck) = payload.matchUTF8Payload + let utf8Content = registers[utf8Reg] + if reverseMatchUTF8( + utf8Content, boundaryCheck: boundaryCheck + ) { + controller.step() + } + case .matchBitset: let (isScalar, reg) = payload.bitsetPayload let bitset = registers[reg] @@ -1028,7 +1055,28 @@ extension String { self.utf8.formIndex(after: &cur) } - guard cur <= end else { return nil } + assert(cur <= end) + + if boundaryCheck && !isOnGraphemeClusterBoundary(cur) { + return nil + } + + return cur + } + + func reverseMatchUTF8( + _ bytes: Array, + at pos: Index, + limitedBy start: Index, + boundaryCheck: Bool + ) -> Index? 
{ + var cur = pos + for b in bytes.reversed() { + guard cur > start, self.utf8[cur] == b else { return nil } + self.utf8.formIndex(before: &cur) + } + + assert(cur >= start) // the final step back may land exactly on `start` if boundaryCheck && !isOnGraphemeClusterBoundary(cur) { return nil diff --git a/Tests/MatchingEngineTests/MatchingEngineTests.swift b/Tests/MatchingEngineTests/MatchingEngineTests.swift index ee11f613f..8785b14e8 100644 --- a/Tests/MatchingEngineTests/MatchingEngineTests.swift +++ b/Tests/MatchingEngineTests/MatchingEngineTests.swift @@ -355,7 +355,8 @@ extension StringMatchingTests { XCTAssertNil(next) } - func testMatchScalarBoundaryCheck() { + // TODO: JH - Write test for when the boundary check passes/check if that's already covered + func testMatchScalarFailsBoundaryCheck() { // Given // \u{62}\u{300}\u{316}\u{65}\u{73}\u{74} let sut = "b̖̀est" @@ -377,19 +378,19 @@ extension StringMatchingTests { // Given // \u{62}\u{300}\u{316}\u{65}\u{73}\u{74} let sut = "b̖̀est" - let atPos = sut.unicodeScalars.index(after: sut.unicodeScalars.startIndex) + let startPos = sut.unicodeScalars.index(after: sut.unicodeScalars.startIndex) // When let next = sut.matchScalar( "\u{300}", - at: atPos, + at: startPos, limitedBy: sut.endIndex, boundaryCheck: false, isCaseInsensitive: false ) // Then - XCTAssertEqual(next, sut.unicodeScalars.index(after: atPos)) + XCTAssertEqual(next, sut.unicodeScalars.index(after: startPos)) } } @@ -480,7 +481,8 @@ extension StringMatchingTests { XCTAssertNil(previous) } - func testReverseMatchScalarBoundaryCheck() { + // TODO: JH - Write test for when the boundary check passes/check if that's already covered + func testReverseMatchScalarFailsBoundaryCheck() { // Given // \u{61}\u{62}\u{300}\u{316}\u{63}\u{64} let sut = "ab̖̀cd" @@ -502,18 +504,204 @@ extension StringMatchingTests { // Given // \u{61}\u{62}\u{300}\u{316}\u{63}\u{64} let sut = "ab̖̀cd" - let atPos = 
sut.unicodeScalars.index(sut.unicodeScalars.startIndex, offsetBy: 3) // When let previous = sut.reverseMatchScalar( "\u{316}", - at: atPos, + at: startPos, limitedBy: sut.startIndex, boundaryCheck: false, isCaseInsensitive: false ) // Then - XCTAssertEqual(previous, sut.unicodeScalars.index(before: atPos)) + XCTAssertEqual(previous, sut.unicodeScalars.index(before: startPos)) + } +} + +// MARK: matchUTF8 tests +extension StringMatchingTests { + func testMatchUTF8() { + // Given + let sut = "quotedliteral" + let needle = Array(sut.prefix(3).utf8) + + // When + let next = sut.matchUTF8( + needle, + at: sut.startIndex, + limitedBy: sut.endIndex, + boundaryCheck: false + ) + + // Then + XCTAssertEqual(next, sut.index(sut.startIndex, offsetBy: 3)) + } + + func testMatchUTF8NoMatch() { + // Given + let haystack = "quotedliteral" + let needle = Array("\(haystack.prefix(2))a".utf8) + + // When + let next = haystack.matchUTF8( + needle, + at: haystack.startIndex, + limitedBy: haystack.endIndex, + boundaryCheck: false + ) + + // Then + XCTAssertNil(next) + } + + func testMatchUTF8MatchPastEnd() { + // Given + let haystack = "quotedliteral" + let needle = Array(haystack.prefix(3).utf8) + + // When + let next = haystack.matchUTF8( + needle, + at: haystack.startIndex, + limitedBy: haystack.index(haystack.startIndex, offsetBy: 2), + boundaryCheck: false + ) + + // Then + XCTAssertNil(next) + } + + // TODO: JH - Write test for when the boundary check passes/check if that's already covered + func testMatchUTF8FailsBoundaryCheck() { + // Given + // \u{62}\u{300}\u{316}\u{65}\u{73}\u{74} + let sut = "b̖̀est" + + // When + let next = sut.matchUTF8( + Array("\u{62}".utf8), + at: sut.unicodeScalars.startIndex, + limitedBy: sut.endIndex, + boundaryCheck: true + ) + + // Then + XCTAssertNil(next) + } + + func testMatchUTF8NoBoundaryCheck() { + // Given + // \u{62}\u{300}\u{316}\u{65}\u{73}\u{74} + let sut = "b̖̀est" + + // When + let next = sut.matchUTF8( + Array("\u{62}".utf8), + at: 
sut.startIndex, + limitedBy: sut.endIndex, + boundaryCheck: false + ) + + // Then + XCTAssertEqual(next, sut.unicodeScalars.index(after: sut.startIndex)) + } +} + +// MARK: reverseMatchUTF8 tests +extension StringMatchingTests { + func testReverseMatchUTF8() { + // Given + let sut = "quotedliteral" + let needle = Array(sut.suffix(3).utf8) + + // When + let previous = sut.reverseMatchUTF8( + needle, + at: sut.index(before: sut.endIndex), + limitedBy: sut.startIndex, + boundaryCheck: false + ) + + // Then + XCTAssertEqual(previous, sut.index(sut.endIndex, offsetBy: -4)) + } + + func testReverseMatchUTF8NoMatch() { + // Given + let haystack = "quotedliteral" + let needle = Array("\(haystack.suffix(2))a".utf8) + + // When + let previous = haystack.reverseMatchUTF8( + needle, + at: haystack.index(before: haystack.endIndex), + limitedBy: haystack.startIndex, + boundaryCheck: false + ) + + // Then + XCTAssertNil(previous) + } + + func testReverseMatchUTF8MatchPastStart() { + // Given + let haystack = "quotedliteral" + let needle = Array(haystack.suffix(3).utf8) + + // When + let previous = haystack.reverseMatchUTF8( + needle, + at: haystack.index(haystack.endIndex, offsetBy: -1), + limitedBy: haystack.index(haystack.unicodeScalars.endIndex, offsetBy: -2), + boundaryCheck: false + ) + + // Then + XCTAssertNil(previous) + } + + // TODO: JH - Write test for when the boundary check passes/check if that's already covered + func testReverseMatchUTF8FailsBoundaryCheck() { + // Given + // \u{61}\u{62}\u{300}\u{316}\u{63}\u{64} + let sut = "ab̖̀cd" + let needle = Array("\u{316}".utf8) + + // When + let previous = sut.reverseMatchUTF8( + needle, + at: sut.utf8.index(sut.utf8.endIndex, offsetBy: -3), + limitedBy: sut.startIndex, + boundaryCheck: true + ) + + // Then + XCTAssertNil(previous) + } + + func testReverseMatchUTF8NoBoundaryCheck() throws { + // Given + // \u{61}\u{62}\u{300}\u{316}\u{63}\u{64} + // utf8 = [97, 98, 204, 128, 204, 150, 99, 100] + let sut = "ab̖̀cd" + // utf8 
= [204, 150] + let needle = Array("\u{316}".utf8) + // Position of \u{316} = 5[utf8] + let startPos = sut.utf8.index(sut.utf8.endIndex, offsetBy: -3) + + // When + let previous = sut.reverseMatchUTF8( + needle, + at: startPos, + limitedBy: sut.startIndex, + boundaryCheck: false + ) + + // Then + // TODO: JH - Is there a better way to write this assertion? + // Previous should be the second byte of \u{300} + XCTAssertEqual(sut.utf8[previous!], 128) } } diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 6ea7da996..437c7e669 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -53,10 +53,11 @@ enum DecodedInstr { case quantify case reverse case reverseMatch - case reverseMatchScalar + case reverseMatchAnyNonNewline case reverseMatchBitset case reverseMatchBuiltin - case reverseMatchAnyNonNewline + case reverseMatchScalar + case reverseMatchUTF8 case reverseQuantify } @@ -165,6 +166,8 @@ extension DecodedInstr { return .reverseQuantify case .matchUTF8: return .matchUTF8 + case .reverseMatchUTF8: + return .reverseMatchUTF8 } } } From 72bf9d7fe02c0cfd905635cea3e8415447c99fe8 Mon Sep 17 00:00:00 2001 From: Jacob Hearst Date: Mon, 17 Feb 2025 11:32:05 -0600 Subject: [PATCH 6/8] Reverse new quant impl --- .../_StringProcessing/Engine/MEBuiltins.swift | 19 + .../Engine/MEReverseQuantify.swift | 602 ++++++++++++++---- .../_StringProcessing/Engine/Processor.swift | 18 +- Sources/_StringProcessing/Utility/Misc.swift | 9 + 4 files changed, 500 insertions(+), 148 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 691de6ef7..80f4217ed 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -368,6 +368,25 @@ extension String { limitedBy: end, isScalarSemantics: isScalarSemantics) } + + internal func reverseMatchRegexDot( + at currentPosition: Index, + 
limitedBy start: Index, + anyMatchesNewline: Bool, + isScalarSemantics: Bool + ) -> Index? { + guard currentPosition > start else { return nil } + + if anyMatchesNewline { + return index( + before: currentPosition, isScalarSemantics: isScalarSemantics) + } + + return reverseMatchAnyNonNewline( + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics) + } } // MARK: - Built-in character class matching diff --git a/Sources/_StringProcessing/Engine/MEReverseQuantify.swift b/Sources/_StringProcessing/Engine/MEReverseQuantify.swift index 5f1afb1bc..f58e8ea2c 100644 --- a/Sources/_StringProcessing/Engine/MEReverseQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEReverseQuantify.swift @@ -1,177 +1,517 @@ +internal import _RegexParser + +private typealias ASCIIBitset = DSLTree.CustomCharacterClass.AsciiBitset + extension Processor { - func _doReverseQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? { + internal mutating func runReverseQuantify(_ payload: QuantifyPayload) -> Bool { + assert(payload.quantKind != .reluctant, ".reluctant is not supported by .quantify") + + let minMatches = payload.minTrips + let maxMatches = payload.maxTrips + let produceSavePointRange = payload.quantKind == .eager let isScalarSemantics = payload.isScalarSemantics + let isZeroOrMore = payload.minTrips == 0 && payload.maxExtraTrips == nil + let isOneOrMore = payload.minTrips == 1 && payload.maxExtraTrips == nil + + let matchResult: (previous: String.Index, savePointRange: Range?)? 
switch payload.type { case .asciiBitset: - return input.reverseMatchASCIIBitset( - registers[payload.bitset], - at: currentPosition, - limitedBy: start, - isScalarSemantics: isScalarSemantics) + if isZeroOrMore { + matchResult = input.reverseMatchZeroOrMoreASCIIBitset( + registers[payload.bitset], + at: currentPosition, + limitedBy: start, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics) + } else if isOneOrMore { + matchResult = input.reverseMatchOneOrMoreASCIIBitset( + registers[payload.bitset], + at: currentPosition, + limitedBy: start, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics) + } else { + matchResult = input.reverseMatchQuantifiedASCIIBitset( + registers[payload.bitset], + at: currentPosition, + limitedBy: start, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics) + } + case .asciiChar: - return input.reverseMatchScalar( - UnicodeScalar.init(_value: UInt32(payload.asciiChar)), - at: currentPosition, - limitedBy: start, - boundaryCheck: !isScalarSemantics, - isCaseInsensitive: false) - case .builtinCC: - guard currentPosition >= start else { return nil } + if isZeroOrMore { + matchResult = input.reverseMatchZeroOrMoreScalar( + Unicode.Scalar(payload.asciiChar), + at: currentPosition, + limitedBy: start, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics) + } else if isOneOrMore { + matchResult = input.reverseMatchOneOrMoreScalar( + Unicode.Scalar(payload.asciiChar), + at: currentPosition, + limitedBy: start, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics) + } else { + matchResult = input.reverseMatchQuantifiedScalar( + Unicode.Scalar(payload.asciiChar), + at: currentPosition, + limitedBy: start, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: 
isScalarSemantics) + } - // We only emit .quantify if it consumes a single character - return input.reverseMatchBuiltinCC( - payload.builtinCC, - at: currentPosition, - limitedBy: start, - isInverted: payload.builtinIsInverted, - isStrictASCII: payload.builtinIsStrict, - isScalarSemantics: isScalarSemantics) case .any: - guard currentPosition >= start else { return nil } + if isZeroOrMore { + matchResult = input.reverseMatchZeroOrMoreRegexDot( + at: currentPosition, + limitedBy: start, + produceSavePointRange: produceSavePointRange, + anyMatchesNewline: payload.anyMatchesNewline, + isScalarSemantics: isScalarSemantics) + } else if isOneOrMore { + matchResult = input.reverseMatchOneOrMoreRegexDot( + at: currentPosition, + limitedBy: start, + produceSavePointRange: produceSavePointRange, + anyMatchesNewline: payload.anyMatchesNewline, + isScalarSemantics: isScalarSemantics) + } else { + matchResult = input.reverseMatchQuantifiedRegexDot( + at: currentPosition, + limitedBy: start, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + anyMatchesNewline: payload.anyMatchesNewline, + isScalarSemantics: isScalarSemantics) + } - if payload.anyMatchesNewline { - if isScalarSemantics { - return input.unicodeScalars.index(before: currentPosition) - } - return input.index(before: currentPosition) + case .builtinCC: + if isZeroOrMore { + matchResult = input.reverseMatchZeroOrMoreBuiltinCC( + payload.builtinCC, + at: currentPosition, + limitedBy: start, + produceSavePointRange: produceSavePointRange, + isInverted: payload.builtinIsInverted, + isStrictASCII: payload.builtinIsStrict, + isScalarSemantics: isScalarSemantics) + } else if isOneOrMore { + matchResult = input.reverseMatchOneOrMoreBuiltinCC( + payload.builtinCC, + at: currentPosition, + limitedBy: start, + produceSavePointRange: produceSavePointRange, + isInverted: payload.builtinIsInverted, + isStrictASCII: payload.builtinIsStrict, + isScalarSemantics: isScalarSemantics) + 
} else { + matchResult = input.reverseMatchQuantifiedBuiltinCC( + payload.builtinCC, + at: currentPosition, + limitedBy: start, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isInverted: payload.builtinIsInverted, + isStrictASCII: payload.builtinIsStrict, + isScalarSemantics: isScalarSemantics) } + } - return input.reverseMatchAnyNonNewline( - at: currentPosition, - limitedBy: start, - isScalarSemantics: isScalarSemantics) + guard let (previous, savePointRange) = matchResult else { + signalFailure() + return false } + if let savePointRange { + assert(produceSavePointRange) + savePoints.append(makeQuantifiedSavePoint( + savePointRange, isScalarSemantics: payload.isScalarSemantics)) + } + currentPosition = previous + return true } +} - /// Generic bounded reverseQuantify instruction interpreter - /// - Handles .eager and .posessive - /// - Handles arbitrary minTrips and maxExtraTrips - mutating func runReverseQuantify(_ payload: QuantifyPayload) -> Bool { - assert(payload.quantKind != .reluctant) +/// MARK: - Non-reluctant quantification operations on String - var trips = 0 - var maxExtraTrips = payload.maxExtraTrips +extension String { + /// Run the reverse quant loop (walking backward toward `start`), using the supplied matching closure + /// + /// NOTE: inline-always to help eliminate the closure overhead, + /// simplify some of the looping structure, etc. + @inline(__always) + fileprivate func _runReverseQuantLoop( + at currentPosition: Index, + limitedBy start: Index, + minMatches: UInt64, + maxMatches: UInt64, + produceSavePointRange: Bool, + isScalarSemantics: Bool, + _ doMatch: ( + _ currentPosition: Index, _ limitedBy: Index, _ isScalarSemantics: Bool + ) -> Index? + ) -> (previous: Index, savePointRange: Range?)? { + var currentPosition = currentPosition - while trips < payload.minTrips { - guard let previous = _doReverseQuantifyMatch(payload) else { - signalFailure() - return false - } + // The range of backtracking positions to try. 
For zero-or-more, starts + // before any match happens. Always ends before the final match, since + // the final match is what is tried without backtracking. An empty range + // is valid and means a single backtracking position at rangeStart. + var rangeStart = currentPosition + var rangeEnd = currentPosition - currentPosition = previous + var numMatches = 0 - // If we've reached the start of the string but still have more trips, fail - if currentPosition == start, trips < payload.minTrips { - signalFailure() - return false + while numMatches < maxMatches { + guard let previous = doMatch( + currentPosition, start, isScalarSemantics + ) else { + break } - - trips += 1 + numMatches &+= 1 + if numMatches == minMatches { + // For this loop iteration, rangeStart will actually trail rangeEnd by + // a single match position. Next iteration, they will be equal + // (empty range denoting a single backtracking point). Note that we + // only ever return a range if we have exceeded `minMatches`; if we + // exactly match `minMatches` there is no backtracking positions to + // remember. 
+ rangeEnd = previous + } + rangeStart = currentPosition + currentPosition = previous + assert(currentPosition < rangeStart) + } - // If we don't have any more trips to take: - if maxExtraTrips == 0 { - // We're done - return true + guard numMatches >= minMatches else { + return nil } - // We've already consumed the minimum number of characters, - // If we can't get another match, the reverse quantify was successful - guard let previous = _doReverseQuantifyMatch(payload) else { - return true + guard produceSavePointRange && numMatches > minMatches else { + // No backtracking positions to try + return (currentPosition, nil) } - maxExtraTrips = maxExtraTrips.map { $0 - 1 } + assert(rangeStart <= rangeEnd) - // Remember the range of valid positions in case we can create a quantified - // save point - var rangeStart = currentPosition - let rangeEnd = currentPosition - currentPosition = previous + // NOTE: We can't assert that rangeEnd trails currentPosition by exactly + // one position, because newline-sequence in scalar semantic mode still + // matches two scalars - while true { - if maxExtraTrips == 0 { break } + return ( + currentPosition, + Range(uncheckedBounds: (lower: rangeStart, upper: rangeEnd)) + ) + } - guard let previous = _doReverseQuantifyMatch(payload) else { - break - } - maxExtraTrips = maxExtraTrips.map({$0 - 1}) - rangeStart = currentPosition - currentPosition = previous - } + // NOTE: [Zero|One]OrMore overloads are to specialize the inlined run loop, + // which has a perf impact. At the time of writing this, 10% for + // zero-or-more and 5% for one-or-more improvement, which could very well + // be much higher if/when the inner match functions are made faster. - if payload.quantKind == .eager { - savePoints.append(makeQuantifiedSavePoint( - rangeStart.. (previous: Index, savePointRange: Range?)? 
{ + _runReverseQuantLoop( + at: currentPosition, + limitedBy: start, + minMatches: 0, + maxMatches: UInt64.max, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, start, isScalarSemantics in + reverseMatchASCIIBitset( + asciiBitset, + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics) } - return true } - - /// Specialized quantify instruction interpreter for `*`, always succeeds - mutating func runEagerZeroOrMoreReverseQuantify(_ payload: QuantifyPayload) { - assert(payload.quantKind == .eager - && payload.minTrips == 0 - && payload.maxExtraTrips == nil) - _doRunEagerZeroOrMoreReverseQuantify(payload) + fileprivate func reverseMatchOneOrMoreASCIIBitset( + _ asciiBitset: ASCIIBitset, + at currentPosition: Index, + limitedBy start: Index, + produceSavePointRange: Bool, + isScalarSemantics: Bool + ) -> (previous: Index, savePointRange: Range?)? { + _runReverseQuantLoop( + at: currentPosition, + limitedBy: start, + minMatches: 1, + maxMatches: UInt64.max, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, start, isScalarSemantics in + reverseMatchASCIIBitset( + asciiBitset, + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics) + } } - // NOTE: So-as to inline into one-or-more call, which makes a significant - // performance difference - @inline(__always) - mutating func _doRunEagerZeroOrMoreReverseQuantify(_ payload: QuantifyPayload) { - guard let previous = _doReverseQuantifyMatch(payload) else { - // Consumed no input, no point saved - return + fileprivate func reverseMatchQuantifiedASCIIBitset( + _ asciiBitset: ASCIIBitset, + at currentPosition: Index, + limitedBy start: Index, + minMatches: UInt64, + maxMatches: UInt64, + produceSavePointRange: Bool, + isScalarSemantics: Bool + ) -> (previous: Index, savePointRange: Range?)? 
{ + _runReverseQuantLoop( + at: currentPosition, + limitedBy: start, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, start, isScalarSemantics in + reverseMatchASCIIBitset( + asciiBitset, + at: currentPosition, + limitedBy: start, + isScalarSemantics: isScalarSemantics) } + } - // Create a quantified save point for every part of the input matched up - // to the final position. - var rangeStart = currentPosition - let rangeEnd = currentPosition - currentPosition = previous - while true { - guard let previous = _doReverseQuantifyMatch(payload) else { break } - rangeStart = currentPosition - currentPosition = previous + fileprivate func reverseMatchZeroOrMoreScalar( + _ scalar: Unicode.Scalar, + at currentPosition: Index, + limitedBy start: Index, + produceSavePointRange: Bool, + isScalarSemantics: Bool + ) -> (previous: Index, savePointRange: Range?)? { + _runReverseQuantLoop( + at: currentPosition, + limitedBy: start, + minMatches: 0, + maxMatches: UInt64.max, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, start, isScalarSemantics in + reverseMatchScalar( + scalar, + at: currentPosition, + limitedBy: start, + boundaryCheck: !isScalarSemantics, + isCaseInsensitive: false) + } + } + fileprivate func reverseMatchOneOrMoreScalar( + _ scalar: Unicode.Scalar, + at currentPosition: Index, + limitedBy start: Index, + produceSavePointRange: Bool, + isScalarSemantics: Bool + ) -> (previous: Index, savePointRange: Range?)? 
{ + _runReverseQuantLoop( + at: currentPosition, + limitedBy: start, + minMatches: 1, + maxMatches: UInt64.max, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, start, isScalarSemantics in + reverseMatchScalar( + scalar, + at: currentPosition, + limitedBy: start, + boundaryCheck: !isScalarSemantics, + isCaseInsensitive: false) } - - savePoints.append(makeQuantifiedSavePoint(rangeStart.. Bool { - assert(payload.quantKind == .eager - && payload.minTrips == 1 - && payload.maxExtraTrips == nil) + fileprivate func reverseMatchQuantifiedScalar( + _ scalar: Unicode.Scalar, + at currentPosition: Index, + limitedBy start: Index, + minMatches: UInt64, + maxMatches: UInt64, + produceSavePointRange: Bool, + isScalarSemantics: Bool + ) -> (previous: Index, savePointRange: Range?)? { + _runReverseQuantLoop( + at: currentPosition, + limitedBy: start, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, start, isScalarSemantics in + reverseMatchScalar( + scalar, + at: currentPosition, + limitedBy: start, + boundaryCheck: !isScalarSemantics, + isCaseInsensitive: false) - // Match at least once - guard let previous = _doReverseQuantifyMatch(payload) else { - signalFailure() - return false } + } - // Run `a+` as `aa*` - currentPosition = previous - _doRunEagerZeroOrMoreReverseQuantify(payload) - return true + fileprivate func reverseMatchZeroOrMoreBuiltinCC( + _ builtinCC: _CharacterClassModel.Representation, + at currentPosition: Index, + limitedBy start: Index, + produceSavePointRange: Bool, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> (previous: Index, savePointRange: Range?)? 
{ + _runReverseQuantLoop( + at: currentPosition, + limitedBy: start, + minMatches: 0, + maxMatches: UInt64.max, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, start, isScalarSemantics in + reverseMatchBuiltinCC( + builtinCC, + at: currentPosition, + limitedBy: start, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) + } + } + fileprivate func reverseMatchOneOrMoreBuiltinCC( + _ builtinCC: _CharacterClassModel.Representation, + at currentPosition: Index, + limitedBy start: Index, + produceSavePointRange: Bool, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> (previous: Index, savePointRange: Range?)? { + _runReverseQuantLoop( + at: currentPosition, + limitedBy: start, + minMatches: 1, + maxMatches: UInt64.max, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, start, isScalarSemantics in + reverseMatchBuiltinCC( + builtinCC, + at: currentPosition, + limitedBy: start, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) + } } - /// Specialized quantify instruction interpreter for ? - mutating func runZeroOrOneReverseQuantify(_ payload: QuantifyPayload) -> Bool { - assert(payload.minTrips == 0 - && payload.maxExtraTrips == 1) - let previous = _doReverseQuantifyMatch(payload) - guard let idx = previous else { - return true // matched zero times + fileprivate func reverseMatchQuantifiedBuiltinCC( + _ builtinCC: _CharacterClassModel.Representation, + at currentPosition: Index, + limitedBy start: Index, + minMatches: UInt64, + maxMatches: UInt64, + produceSavePointRange: Bool, + isInverted: Bool, + isStrictASCII: Bool, + isScalarSemantics: Bool + ) -> (previous: Index, savePointRange: Range?)? 
{ + _runReverseQuantLoop( + at: currentPosition, + limitedBy: start, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, start, isScalarSemantics in + reverseMatchBuiltinCC( + builtinCC, + at: currentPosition, + limitedBy: start, + isInverted: isInverted, + isStrictASCII: isStrictASCII, + isScalarSemantics: isScalarSemantics) } - if payload.quantKind != .possessive { - // Save the zero match - savePoints.append(makeSavePoint(resumingAt: currentPC+1)) + } + + fileprivate func reverseMatchZeroOrMoreRegexDot( + at currentPosition: Index, + limitedBy start: Index, + produceSavePointRange: Bool, + anyMatchesNewline: Bool, + isScalarSemantics: Bool + ) -> (previous: Index, savePointRange: Range?)? { + _runReverseQuantLoop( + at: currentPosition, + limitedBy: start, + minMatches: 0, + maxMatches: UInt64.max, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, start, isScalarSemantics in + reverseMatchRegexDot( + at: currentPosition, + limitedBy: start, + anyMatchesNewline: anyMatchesNewline, + isScalarSemantics: isScalarSemantics) + } + } + fileprivate func reverseMatchOneOrMoreRegexDot( + at currentPosition: Index, + limitedBy start: Index, + produceSavePointRange: Bool, + anyMatchesNewline: Bool, + isScalarSemantics: Bool + ) -> (previous: Index, savePointRange: Range?)? 
{ + _runReverseQuantLoop( + at: currentPosition, + limitedBy: start, + minMatches: 1, + maxMatches: UInt64.max, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, start, isScalarSemantics in + reverseMatchRegexDot( + at: currentPosition, + limitedBy: start, + anyMatchesNewline: anyMatchesNewline, + isScalarSemantics: isScalarSemantics) + } + } + + fileprivate func reverseMatchQuantifiedRegexDot( + at currentPosition: Index, + limitedBy start: Index, + minMatches: UInt64, + maxMatches: UInt64, + produceSavePointRange: Bool, + anyMatchesNewline: Bool, + isScalarSemantics: Bool + ) -> (previous: Index, savePointRange: Range?)? { + _runReverseQuantLoop( + at: currentPosition, + limitedBy: start, + minMatches: minMatches, + maxMatches: maxMatches, + produceSavePointRange: produceSavePointRange, + isScalarSemantics: isScalarSemantics + ) { currentPosition, start, isScalarSemantics in + reverseMatchRegexDot( + at: currentPosition, + limitedBy: start, + anyMatchesNewline: anyMatchesNewline, + isScalarSemantics: isScalarSemantics) } - currentPosition = idx - return true } } + + diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 6a5244793..79e732a70 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -785,23 +785,7 @@ extension Processor { controller.step() } case .reverseQuantify: - let quantPayload = payload.quantify - let matched: Bool - switch (quantPayload.quantKind, quantPayload.minTrips, quantPayload.maxExtraTrips) { - case (.reluctant, _, _): - assertionFailure(".reluctant is not supported by .quantify") - return - case (.eager, 0, nil): - runEagerZeroOrMoreReverseQuantify(quantPayload) - matched = true - case (.eager, 1, nil): - matched = runEagerOneOrMoreReverseQuantify(quantPayload) - case (_, 0, 1): - matched = runZeroOrOneReverseQuantify(quantPayload) - default: - 
matched = runReverseQuantify(quantPayload) - } - if matched { + if runReverseQuantify(payload.quantify) { controller.step() } diff --git a/Sources/_StringProcessing/Utility/Misc.swift b/Sources/_StringProcessing/Utility/Misc.swift index d63370b55..191f09d4c 100644 --- a/Sources/_StringProcessing/Utility/Misc.swift +++ b/Sources/_StringProcessing/Utility/Misc.swift @@ -74,6 +74,15 @@ extension String { return index(after: idx) } } + + /// Index before in either grapheme or scalar view + func index(before idx: Index, isScalarSemantics: Bool) -> Index { + if isScalarSemantics { + return unicodeScalars.index(before: idx) + } else { + return index(before: idx) + } + } } From c84f3392e1eb1cbd2c3b6f0d7096792fb2254c6b Mon Sep 17 00:00:00 2001 From: Jacob Hearst Date: Mon, 26 May 2025 10:01:09 -0500 Subject: [PATCH 7/8] Save point --- Sources/_StringProcessing/ByteCodeGen.swift | 5 +- .../_StringProcessing/Engine/MEBuiltins.swift | 89 +++++---- .../Engine/MEReverseQuantify.swift | 14 +- .../_StringProcessing/Engine/Metrics.swift | 49 ++--- .../_StringProcessing/Engine/Processor.swift | 41 ++-- Sources/_StringProcessing/Executor.swift | 6 +- Sources/_StringProcessing/Unicode/ASCII.swift | 55 +++--- Tests/MatchingEngineTests/ASCIITests.swift | 183 +++++++++--------- .../MatchingEngineTests.swift | 104 +++++----- Tests/RegexTests/MatchTests.swift | 34 ++-- 10 files changed, 289 insertions(+), 291 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 7569f7489..9646a990a 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -368,9 +368,8 @@ fileprivate extension Compiler.ByteCodeGen { throw Unsupported("Lookarounds with custom consumers") } + options.beginScope() if !kind.forwards { - defer { options.endScope() } - options.beginScope() // TODO: JH - Is it okay to use .fake here? 
options.apply(.init(adding: [.init(.reverse, location: .fake)])) } @@ -380,6 +379,8 @@ fileprivate extension Compiler.ByteCodeGen { } else { try emitNegativeLookaround(child) } + + options.endScope() } mutating func emitPositiveLookaround(_ child: DSLTree.Node) throws { diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 80f4217ed..e28a33fe8 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -36,7 +36,7 @@ extension Processor { isStrictASCII: Bool, isScalarSemantics: Bool ) -> Bool { - guard currentPosition >= start, let previous = input.reverseMatchBuiltinCC( + guard currentPosition >= start, let previous = input.matchPreviousBuiltinCC( cc, at: currentPosition, limitedBy: start, @@ -182,11 +182,11 @@ extension String { : (substr.first!, substr.endIndex) } - /// Returns the character at `pos`, bounded by `start`, as well as the lower - /// boundary of the returned character. - /// + /// Returns the character before `pos`, bounded by `start`, as well as that + /// character's index. + /// /// This function handles loading a character from a string while respecting - /// an start boundary, even if that start boundary is sub-character or sub-scalar. + /// a start boundary, even if that start boundary is sub-character or sub-scalar. /// /// - If `pos` is at or past `start`, this function returns `nil`. /// - If `start` is between `pos` and the next grapheme cluster boundary (i.e., @@ -204,15 +204,15 @@ extension String { /// - Returns: The character at `pos`, bounded by `start`, if it exists, along /// with the lower bound of that character. The lower bound is always /// scalar-aligned. - func characterAndStart( - at pos: String.Index, + func character( + before pos: String.Index, limitedBy start: String.Index - ) -> (Character, characterStart: String.Index)? { + ) -> (char: Character, index: String.Index)? 
{ // FIXME: Sink into the stdlib to avoid multiple boundary calculations guard pos > start else { return nil } let previous = index(before: pos) if previous >= start { - return (self[pos], previous) + return (self[previous], previous) } // `start` must be a sub-character position that is between `pos` and the @@ -220,7 +220,7 @@ extension String { // boundary, but if it's in the middle of a scalar's code units, there // may not be a character to return at all after rounding down. Use // `Substring`'s rounding to determine what we can return. - let substr = self[start.. String.Index? { guard currentPosition > start else { return nil } - if case .definite(let result) = _quickReverseMatchAnyNonNewline( + if case .definite(let result) = _quickMatchPreviousAnyNonNewline( at: currentPosition, limitedBy: start, isScalarSemantics: isScalarSemantics ) { - assert(result == _thoroughReverseMatchAnyNonNewline( + assert(result == _thoroughMatchPreviousAnyNonNewline( at: currentPosition, limitedBy: start, isScalarSemantics: isScalarSemantics)) return result } - return _thoroughReverseMatchAnyNonNewline( + return _thoroughMatchPreviousAnyNonNewline( at: currentPosition, limitedBy: start, isScalarSemantics: isScalarSemantics) @@ -292,13 +292,13 @@ extension String { } @inline(__always) - private func _quickReverseMatchAnyNonNewline( + private func _quickMatchPreviousAnyNonNewline( at currentPosition: String.Index, limitedBy start: String.Index, isScalarSemantics: Bool ) -> QuickResult { assert(currentPosition > start) - guard let (asciiValue, previous, isCRLF) = _quickReverseASCIICharacter( + guard let (asciiValue, previous, isCRLF) = _quickPreviousASCIICharacter( at: currentPosition, limitedBy: start ) else { return .unknown @@ -332,22 +332,22 @@ extension String { } @inline(never) - private func _thoroughReverseMatchAnyNonNewline( + private func _thoroughMatchPreviousAnyNonNewline( at currentPosition: String.Index, limitedBy start: String.Index, isScalarSemantics: Bool ) -> 
String.Index? { if isScalarSemantics { guard currentPosition > start else { return nil } - let scalar = unicodeScalars[currentPosition] + let scalar = unicodeScalars[unicodeScalars.index(before: currentPosition)] guard !scalar.isNewline else { return nil } return unicodeScalars.index(before: currentPosition) } - guard let (char, previous) = characterAndStart(at: currentPosition, limitedBy: start), - !char.isNewline + guard let (previousCharacter, previousPosition) = character(before: currentPosition, limitedBy: start), + !previousCharacter.isNewline else { return nil } - return previous + return previousPosition } internal func matchRegexDot( @@ -382,7 +382,7 @@ extension String { before: currentPosition, isScalarSemantics: isScalarSemantics) } - return reverseMatchAnyNonNewline( + return matchPreviousAnyNonNewline( at: currentPosition, limitedBy: start, isScalarSemantics: isScalarSemantics) @@ -427,7 +427,7 @@ extension String { isScalarSemantics: isScalarSemantics) } - func reverseMatchBuiltinCC( + func matchPreviousBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, limitedBy start: String.Index, @@ -435,8 +435,8 @@ extension String { isStrictASCII: Bool, isScalarSemantics: Bool ) -> String.Index? 
{ - guard currentPosition > start else { return nil } - if case .definite(let result) = _quickReverseMatchBuiltinCC( + guard currentPosition > start, currentPosition < endIndex else { return nil } + if case .definite(let result) = _quickMatchPreviousBuiltinCC( cc, at: currentPosition, limitedBy: start, @@ -444,7 +444,7 @@ extension String { isStrictASCII: isStrictASCII, isScalarSemantics: isScalarSemantics ) { - assert(result == _thoroughReverseMatchBuiltinCC( + assert(result == _thoroughMatchPreviousBuiltinCC( cc, at: currentPosition, limitedBy: start, @@ -453,7 +453,7 @@ extension String { isScalarSemantics: isScalarSemantics)) return result } - return _thoroughReverseMatchBuiltinCC( + return _thoroughMatchPreviousBuiltinCC( cc, at: currentPosition, limitedBy: start, @@ -485,8 +485,9 @@ extension String { return .definite(result == isInverted ? nil : next) } + /// Quick match a built in character class against the character before `currentPosition` @inline(__always) - private func _quickReverseMatchBuiltinCC( + private func _quickMatchPreviousBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, limitedBy start: String.Index, @@ -495,7 +496,7 @@ extension String { isScalarSemantics: Bool ) -> QuickResult { assert(currentPosition > start) - guard let (previous, result) = _quickReverseMatch( + guard let (previous, result) = _quickMatchPrevious( cc, at: currentPosition, limitedBy: start, @@ -506,7 +507,6 @@ extension String { return .definite(result == isInverted ? nil : previous) } - // TODO: JH - How can this be unit tested? 
// Mentioned in ProgrammersManual.md, update docs if redesigned @inline(never) private func _thoroughMatchBuiltinCC( @@ -591,7 +591,7 @@ extension String { } @inline(never) - private func _thoroughReverseMatchBuiltinCC( + private func _thoroughMatchPreviousBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, limitedBy start: String.Index, @@ -601,19 +601,18 @@ extension String { ) -> String.Index? { // TODO: Branch here on scalar semantics // Don't want to pay character cost if unnecessary - guard let (char, previousIndex) = - characterAndStart(at: currentPosition, limitedBy: start) + guard var (previousChar, previousIndex) = + character(before: currentPosition, limitedBy: start) else { return nil } - var previous = previousIndex let scalar = unicodeScalars[currentPosition] let asciiCheck = !isStrictASCII || (scalar.isASCII && isScalarSemantics) - || char.isASCII + || previousChar.isASCII var matched: Bool if isScalarSemantics && cc != .anyGrapheme { - previous = unicodeScalars.index(before: currentPosition) + unicodeScalars.formIndex(before: &previousIndex) } switch cc { @@ -623,42 +622,42 @@ extension String { if isScalarSemantics { matched = scalar.properties.numericType != nil && asciiCheck } else { - matched = char.isNumber && asciiCheck + matched = previousChar.isNumber && asciiCheck } case .horizontalWhitespace: if isScalarSemantics { matched = scalar.isHorizontalWhitespace && asciiCheck } else { - matched = char._isHorizontalWhitespace && asciiCheck + matched = previousChar._isHorizontalWhitespace && asciiCheck } case .verticalWhitespace: if isScalarSemantics { matched = scalar.isNewline && asciiCheck } else { - matched = char._isNewline && asciiCheck + matched = previousChar._isNewline && asciiCheck } case .newlineSequence: if isScalarSemantics { matched = scalar.isNewline && asciiCheck if matched && scalar == "\r" - && previous >= start && unicodeScalars[previous] == "\n" { + && previousIndex >= start && 
unicodeScalars[previousIndex] == "\n" { // Match a full CR-LF sequence even in scalar semantics - unicodeScalars.formIndex(after: &previous) + unicodeScalars.formIndex(after: &previousIndex) } } else { - matched = char._isNewline && asciiCheck + matched = previousChar._isNewline && asciiCheck } case .whitespace: if isScalarSemantics { matched = scalar.properties.isWhitespace && asciiCheck } else { - matched = char.isWhitespace && asciiCheck + matched = previousChar.isWhitespace && asciiCheck } case .word: if isScalarSemantics { matched = scalar.properties.isAlphabetic && asciiCheck } else { - matched = char.isWordCharacter && asciiCheck + matched = previousChar.isWordCharacter && asciiCheck } } @@ -670,6 +669,6 @@ extension String { return nil } - return previous + return previousIndex } } diff --git a/Sources/_StringProcessing/Engine/MEReverseQuantify.swift b/Sources/_StringProcessing/Engine/MEReverseQuantify.swift index f58e8ea2c..d8dde890d 100644 --- a/Sources/_StringProcessing/Engine/MEReverseQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEReverseQuantify.swift @@ -189,7 +189,7 @@ extension String { } rangeStart = currentPosition currentPosition = previous - assert(currentPosition > rangeEnd) + assert(currentPosition < rangeStart) } guard numMatches >= minMatches else { @@ -302,7 +302,7 @@ extension String { produceSavePointRange: produceSavePointRange, isScalarSemantics: isScalarSemantics ) { currentPosition, start, isScalarSemantics in - reverseMatchScalar( + matchPreviousScalar( scalar, at: currentPosition, limitedBy: start, @@ -325,7 +325,7 @@ extension String { produceSavePointRange: produceSavePointRange, isScalarSemantics: isScalarSemantics ) { currentPosition, start, isScalarSemantics in - reverseMatchScalar( + matchPreviousScalar( scalar, at: currentPosition, limitedBy: start, @@ -351,7 +351,7 @@ extension String { produceSavePointRange: produceSavePointRange, isScalarSemantics: isScalarSemantics ) { currentPosition, start, isScalarSemantics in 
- reverseMatchScalar( + matchPreviousScalar( scalar, at: currentPosition, limitedBy: start, @@ -378,7 +378,7 @@ extension String { produceSavePointRange: produceSavePointRange, isScalarSemantics: isScalarSemantics ) { currentPosition, start, isScalarSemantics in - reverseMatchBuiltinCC( + matchPreviousBuiltinCC( builtinCC, at: currentPosition, limitedBy: start, @@ -404,7 +404,7 @@ extension String { produceSavePointRange: produceSavePointRange, isScalarSemantics: isScalarSemantics ) { currentPosition, start, isScalarSemantics in - reverseMatchBuiltinCC( + matchPreviousBuiltinCC( builtinCC, at: currentPosition, limitedBy: start, @@ -433,7 +433,7 @@ extension String { produceSavePointRange: produceSavePointRange, isScalarSemantics: isScalarSemantics ) { currentPosition, start, isScalarSemantics in - reverseMatchBuiltinCC( + matchPreviousBuiltinCC( builtinCC, at: currentPosition, limitedBy: start, diff --git a/Sources/_StringProcessing/Engine/Metrics.swift b/Sources/_StringProcessing/Engine/Metrics.swift index 372a7e1b4..a7e05168f 100644 --- a/Sources/_StringProcessing/Engine/Metrics.swift +++ b/Sources/_StringProcessing/Engine/Metrics.swift @@ -1,68 +1,69 @@ extension Processor { -#if PROCESSOR_MEASUREMENTS_ENABLED +// #if PROCESSOR_MEASUREMENTS_ENABLED struct ProcessorMetrics { var instructionCounts: [Instruction.OpCode: Int] = [:] var backtracks: Int = 0 var resets: Int = 0 var cycleCount: Int = 0 - var isTracingEnabled: Bool = false - var shouldMeasureMetrics: Bool = false + var isTracingEnabled: Bool = true + var shouldMeasureMetrics: Bool = true init(isTracingEnabled: Bool, shouldMeasureMetrics: Bool) { - self.isTracingEnabled = isTracingEnabled - self.shouldMeasureMetrics = shouldMeasureMetrics +// self.isTracingEnabled = isTracingEnabled +// self.shouldMeasureMetrics = shouldMeasureMetrics } } -#else - struct ProcessorMetrics { - var isTracingEnabled: Bool { false } - var shouldMeasureMetrics: Bool { false } - var cycleCount: Int { 0 } - - 
init(isTracingEnabled: Bool, shouldMeasureMetrics: Bool) { } - } -#endif +//#else +// struct ProcessorMetrics { +// var isTracingEnabled: Bool { false } +// var shouldMeasureMetrics: Bool { false } +// var cycleCount: Int { 0 } +// +// init(isTracingEnabled: Bool, shouldMeasureMetrics: Bool) { } +// } +//#endif } extension Processor { mutating func startCycleMetrics() { -#if PROCESSOR_MEASUREMENTS_ENABLED +// #if PROCESSOR_MEASUREMENTS_ENABLED if metrics.cycleCount == 0 { + print(instructions.map(\.description).joined(separator: "\n")) trace() measureMetrics() } -#endif +//#endif } mutating func endCycleMetrics() { -#if PROCESSOR_MEASUREMENTS_ENABLED +// #if PROCESSOR_MEASUREMENTS_ENABLED metrics.cycleCount += 1 trace() measureMetrics() _checkInvariants() -#endif +//#endif } } extension Processor.ProcessorMetrics { mutating func addReset() { -#if PROCESSOR_MEASUREMENTS_ENABLED +// #if PROCESSOR_MEASUREMENTS_ENABLED self.resets += 1 -#endif +//#endif } mutating func addBacktrack() { -#if PROCESSOR_MEASUREMENTS_ENABLED +// #if PROCESSOR_MEASUREMENTS_ENABLED self.backtracks += 1 -#endif +//#endif } } extension Processor { -#if PROCESSOR_MEASUREMENTS_ENABLED +// #if PROCESSOR_MEASUREMENTS_ENABLED func printMetrics() { print("===") print("Total cycle count: \(metrics.cycleCount)") @@ -92,5 +93,5 @@ extension Processor { measure() } } -#endif +//#endif } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 79e732a70..80f067f4b 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -324,7 +324,7 @@ extension Processor { mutating func reverseMatch( _ e: Element, isCaseInsensitive: Bool ) -> Bool { - let previous = input.reverseMatch( + let previous = input.matchPrevious( e, at: currentPosition, limitedBy: start, @@ -390,7 +390,7 @@ extension Processor { boundaryCheck: Bool, isCaseInsensitive: Bool ) -> Bool { - let previous = 
input.reverseMatchScalar( + let previous = input.matchPreviousScalar( s, at: currentPosition, limitedBy: start, @@ -507,7 +507,7 @@ extension Processor { mutating func reverseMatchAnyNonNewline( isScalarSemantics: Bool ) -> Bool { - guard let previous = input.reverseMatchAnyNonNewline( + guard let previous = input.matchPreviousAnyNonNewline( at: currentPosition, limitedBy: start, isScalarSemantics: isScalarSemantics @@ -918,7 +918,8 @@ extension String { return next } - func reverseMatch( + // Match `char` to the character at the index before `pos` + func matchPrevious( _ char: Character, at pos: Index, limitedBy start: String.Index, @@ -926,15 +927,15 @@ extension String { ) -> Index? { // TODO: This can be greatly sped up with string internals // TODO: This is also very much quick-check-able - guard let (stringChar, next) = characterAndStart(at: pos, limitedBy: start) else { return nil } + guard let prev = character(before: pos, limitedBy: start) else { return nil } if isCaseInsensitive { - guard stringChar.lowercased() == char.lowercased() else { return nil } + guard prev.char.lowercased() == char.lowercased() else { return nil } } else { - guard stringChar == char else { return nil } + guard prev.char == char else { return nil } } - return next + return prev.index } func matchSeq( @@ -996,7 +997,7 @@ extension String { return idx } - func reverseMatchScalar( + func matchPreviousScalar( _ scalar: Unicode.Scalar, at pos: Index, limitedBy start: String.Index, @@ -1006,25 +1007,25 @@ extension String { // TODO: extremely quick-check-able // TODO: can be sped up with string internals guard pos > start else { return nil } - let curScalar = unicodeScalars[pos] + let prevIndex = unicodeScalars.index(before: pos) + let prevScalar = unicodeScalars[prevIndex] if isCaseInsensitive { - guard curScalar.properties.lowercaseMapping == scalar.properties.lowercaseMapping + guard prevScalar.properties.lowercaseMapping == scalar.properties.lowercaseMapping else { return nil } } 
else { - guard curScalar == scalar else { return nil } + guard prevScalar == scalar else { return nil } } - let idx = unicodeScalars.index(before: pos) - assert(idx >= start, "Input is a substring with a sub-scalar startIndex.") + assert(prevIndex >= start, "Input is a substring with a sub-scalar startIndex.") - if boundaryCheck && !isOnGraphemeClusterBoundary(idx) { + if boundaryCheck && !isOnGraphemeClusterBoundary(prevIndex) { return nil } - return idx + return prevIndex } func matchUTF8( @@ -1135,7 +1136,7 @@ extension String { // TODO: More fodder for refactoring `_quickASCIICharacter`, see the comment // there - guard let (asciiByte, previous, isCRLF) = _quickReverseASCIICharacter( + guard let (asciiByte, previous, isCRLF) = _quickPreviousASCIICharacter( at: pos, limitedBy: start ) else { @@ -1144,9 +1145,9 @@ extension String { guard bitset.matches(unicodeScalars[pos]) else { return nil } return unicodeScalars.index(before: pos) } else { - guard let (char, previous) = characterAndStart(at: pos, limitedBy: start), - bitset.matches(char) else { return nil } - return previous + guard let prev = character(before: pos, limitedBy: start), + bitset.matches(prev.char) else { return nil } + return prev.index } } diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 38c317f0e..5a90df528 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -211,9 +211,9 @@ extension Executor { extension Processor { fileprivate mutating func run() throws -> Input.Index? 
{ -#if PROCESSOR_MEASUREMENTS_ENABLED - defer { if cpu.metrics.shouldMeasureMetrics { cpu.printMetrics() } } -#endif +// #if PROCESSOR_MEASUREMENTS_ENABLED + defer { if metrics.shouldMeasureMetrics { printMetrics() } } +//#endif if self.state == .fail { if let e = failureReason { throw e diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift index efbe406e3..3e6816a3e 100644 --- a/Sources/_StringProcessing/Unicode/ASCII.swift +++ b/Sources/_StringProcessing/Unicode/ASCII.swift @@ -122,34 +122,35 @@ extension String { return (first: base, next: next, crLF: false) } + /// Get the ASCII character at the position before `idx` + /// /// TODO: better to take isScalarSemantics parameter, we can return more results /// and we can give the right `next` index, not requiring the caller to re-adjust it /// TODO: detailed description of nuanced semantics - func _quickReverseASCIICharacter( + func _quickPreviousASCIICharacter( at idx: Index, limitedBy start: Index - ) -> (first: UInt8, previous: Index, crLF: Bool)? { + ) -> (char: UInt8, index: Index, crLF: Bool)? 
{ // TODO: fastUTF8 version assert(String.Index(idx, within: unicodeScalars) != nil) - assert(idx >= start) + assert(idx > start) - // If we're already at the start, there is no previous character - if idx == start { - return nil - } + // The index of the character we want to return + var previous = utf8.index(before: idx) - let char = utf8[idx] + // The character we want to return + let char = utf8[previous] guard char._isASCII else { - assert(!self[idx].isASCII) + assert(!self[previous].isASCII) return nil } - var previous = utf8.index(before: idx) if previous == start { - return (first: char, previous: previous, crLF: false) + // We've hit the start so there's no need to check for CR-LF + return (char: char, index: previous, crLF: false) } - let head = utf8[previous] + let head = utf8[utf8.index(before: previous)] guard head._isSub300StartingByte else { return nil } // Handle CR-LF by reversing past the sequence if both characters are present @@ -158,11 +159,11 @@ extension String { guard previous == start || utf8[previous]._isSub300StartingByte else { return nil } - return (first: char, previous: previous, crLF: true) + return (char: char, index: previous, crLF: true) } - assert(self[idx].isASCII && self[idx] != "\r\n") - return (first: char, previous: previous, crLF: false) + assert(self[previous].isASCII && self[previous] != "\r\n") + return (char: char, index: previous, crLF: false) } func _quickMatch( @@ -212,14 +213,14 @@ extension String { } } - func _quickReverseMatch( + func _quickMatchPrevious( _ cc: _CharacterClassModel.Representation, at idx: Index, limitedBy start: Index, isScalarSemantics: Bool ) -> (previous: Index, matchResult: Bool)? 
{ /// ASCII fast-paths - guard let (asciiValue, previous, isCRLF) = _quickReverseASCIICharacter( + guard let (asciiValue, previousIndex, isCRLF) = _quickPreviousASCIICharacter( at: idx, limitedBy: start ) else { return nil @@ -228,34 +229,34 @@ extension String { // TODO: bitvectors switch cc { case .any, .anyGrapheme: - return (previous, true) + return (previousIndex, true) case .digit: - return (previous, asciiValue._asciiIsDigit) + return (previousIndex, asciiValue._asciiIsDigit) case .horizontalWhitespace: - return (previous, asciiValue._asciiIsHorizontalWhitespace) + return (previousIndex, asciiValue._asciiIsHorizontalWhitespace) case .verticalWhitespace, .newlineSequence: if asciiValue._asciiIsVerticalWhitespace { if isScalarSemantics && isCRLF && cc == .verticalWhitespace { - return (utf8.index(after: previous), true) + return (utf8.index(after: previousIndex), true) } - return (previous, true) + return (previousIndex, true) } - return (previous, false) + return (previousIndex, false) case .whitespace: if asciiValue._asciiIsWhitespace { if isScalarSemantics && isCRLF { - return (utf8.index(after: previous), true) + return (utf8.index(after: previousIndex), true) } - return (previous, true) + return (previousIndex, true) } - return (previous, false) + return (previousIndex, false) case .word: - return (previous, asciiValue._asciiIsWord) + return (previousIndex, asciiValue._asciiIsWord) } } } diff --git a/Tests/MatchingEngineTests/ASCIITests.swift b/Tests/MatchingEngineTests/ASCIITests.swift index 4af6bf28f..1c6c7527b 100644 --- a/Tests/MatchingEngineTests/ASCIITests.swift +++ b/Tests/MatchingEngineTests/ASCIITests.swift @@ -81,39 +81,28 @@ final class QuickASCIICharacterTests: XCTestCase { } } -final class QuickReverseASCIICharacterTests: XCTestCase { +final class QuickPreviousASCIICharacterTests: XCTestCase { func testHappyPath() throws { // Given let sut = "foo" let index = sut.index(after: sut.startIndex) // When - let result = 
sut._quickReverseASCIICharacter(at: index, limitedBy: sut.startIndex) + let result = sut._quickPreviousASCIICharacter(at: index, limitedBy: sut.startIndex) // Then let (char, previousIdx, isCRLF) = try XCTUnwrap(result) - XCTAssertEqual(char, sut.utf8[index]) + XCTAssertEqual(char, sut.utf8[sut.utf8.startIndex]) XCTAssertEqual(previousIdx, sut.startIndex) XCTAssertFalse(isCRLF) } - func testAtStart() throws { - // Given - let sut = "foo" - - // When - let result = sut._quickReverseASCIICharacter(at: sut.startIndex, limitedBy: sut.startIndex) - - // Then - XCTAssertNil(result) - } - func testNonASCIIChar() throws { // Given - let sut = "é" + let sut = "éi" // When - let result = sut._quickReverseASCIICharacter(at: sut.startIndex, limitedBy: sut.startIndex) + let result = sut._quickPreviousASCIICharacter(at: sut.index(after: sut.startIndex), limitedBy: sut.startIndex) // Then XCTAssertNil(result) @@ -125,31 +114,31 @@ final class QuickReverseASCIICharacterTests: XCTestCase { let index = sut.index(after: sut.startIndex) // When - let result = sut._quickReverseASCIICharacter(at: index, limitedBy: sut.startIndex) + let result = sut._quickPreviousASCIICharacter(at: index, limitedBy: sut.startIndex) // Then let (char, previousIdx, isCRLF) = try XCTUnwrap(result) - XCTAssertEqual(char, sut.utf8[index]) + XCTAssertEqual(char, sut.utf8[sut.startIndex]) XCTAssertEqual(previousIdx, sut.startIndex) XCTAssertFalse(isCRLF) } // TODO: JH - Figure out how to test sub 300 starting bytes - func testIsCRLF() throws { - // Given - let sut = "foo\r\n" - // Start at '\n' - let index = sut.utf8.index(before: sut.endIndex) - - // When - let result = sut._quickReverseASCIICharacter(at: index, limitedBy: sut.startIndex) - - // Then - let (char, previousIndex, isCRLF) = try XCTUnwrap(result) - XCTAssertEqual(char, sut.utf8[index]) - XCTAssertEqual(previousIndex, sut.index(sut.startIndex, offsetBy: 2)) - XCTAssertTrue(isCRLF) - } + // FIXME: JH +// func testIsCRLF() throws { +// // Given +// 
let sut = "foo\r\nbar" +// +// // When +// let result = sut._quickPreviousASCIICharacter(at: sut.utf8.endIndex, limitedBy: sut.startIndex) +// +// // Then +// let (char, actualIndex, isCRLF) = try XCTUnwrap(result) +// let expectedIndex = sut.utf8.index(sut.utf8.endIndex, offsetBy: -2) +// XCTAssertEqual(char, sut.utf8[expectedIndex]) +// XCTAssertEqual(actualIndex, expectedIndex) +// XCTAssertTrue(isCRLF) +// } } final class ASCIIQuickMatchTests: XCTestCase { @@ -254,85 +243,87 @@ final class ASCIIQuickMatchTests: XCTestCase { } } -final class ASCIIQuickReverseMatchTests: XCTestCase { +final class ASCIIQuickMatchPreviousTests: XCTestCase { func testAny() throws { try _test(matching: .any, against: "1!") try _test(matching: .anyGrapheme, against: "1!") } func testDigit() throws { - try _test(matching: .digit, against: "a1") - try _test(matching: .digit, against: "1a", shouldMatch: false) + try _test(matching: .digit, against: "1a") + try _test(matching: .digit, against: "a1", shouldMatch: false) } func testHorizontalWhitespace() throws { - try _test(matching: .horizontalWhitespace, against: "a ") - try _test(matching: .horizontalWhitespace, against: "a\t") - try _test(matching: .horizontalWhitespace, against: "a\n", shouldMatch: false) + try _test(matching: .horizontalWhitespace, against: " b") + try _test(matching: .horizontalWhitespace, against: "\tb") + try _test(matching: .horizontalWhitespace, against: "\nb", shouldMatch: false) } func testVerticalWhitespace() throws { - try _test(matching: .verticalWhitespace, against: "a\n") - try _test(matching: .verticalWhitespace, against: "a\t", shouldMatch: false) + try _test(matching: .verticalWhitespace, against: "\nb") + try _test(matching: .verticalWhitespace, against: "\tb", shouldMatch: false) } - func testVerticalWhitespaceMatchesCRLF() throws { - let sut = "a\r\n" - - // When using scalar semantics: - // The next index should be the index of the "\n" character - try _test( - matching: .verticalWhitespace, - 
against: sut, - at: sut.utf8.index(before: sut.utf8.endIndex), - expectedPrevious: sut.utf8.firstIndex(of: ._carriageReturn) - ) - - // When not using scalar semantics: - // The next index should be the index after the whole \r\n sequence (the end index) - try _test( - matching: .verticalWhitespace, - against: sut, - isScalarSemantics: false - ) - } + // FIXME: JH +// func testVerticalWhitespaceMatchesCRLF() throws { +// let sut = "a\r\nb" +// +// // When using scalar semantics: +// // The next index should be the index of the "\n" character +// try _test( +// matching: .verticalWhitespace, +// against: sut, +// at: sut.utf8.index(before: sut.utf8.endIndex), +// expectedPrevious: sut.utf8.firstIndex(of: ._carriageReturn) +// ) +// +// // When not using scalar semantics: +// // The next index should be the index after the whole \r\n sequence (the end index) +// try _test( +// matching: .verticalWhitespace, +// against: sut, +// isScalarSemantics: false +// ) +// } func testWhitespace() throws { - try _test(matching: .whitespace, against: "a ") - try _test(matching: .whitespace, against: "a\t") - try _test(matching: .whitespace, against: "a\n") - try _test(matching: .whitespace, against: " a", shouldMatch: false) + try _test(matching: .whitespace, against: " a") + try _test(matching: .whitespace, against: "\ta") + try _test(matching: .whitespace, against: "\na") + try _test(matching: .whitespace, against: " ab", shouldMatch: false) } - func testWhitespaceCRLF() throws { - // Given - let sut = "a\r\n" - - // When using scalar semantics: - // The previous index should be the index of the "\r" character - try _test( - matching: .whitespace, - against: sut, - at: sut.utf8.index(before: sut.utf8.endIndex), - expectedPrevious: sut.utf8.firstIndex(of: ._carriageReturn) - ) - - // When not using scalar semantics: - // The previous index should be the index before the whole \r\n sequence - // (the start index) - try _test( - matching: .whitespace, - against: sut, - 
isScalarSemantics: false - ) - } + // FIXME: JH +// func testWhitespaceCRLF() throws { +// // Given +// let sut = "a\r\n" +// +// // When using scalar semantics: +// // The previous index should be the index of the "\r" character +// try _test( +// matching: .whitespace, +// against: sut, +// at: sut.utf8.index(before: sut.utf8.endIndex), +// expectedPrevious: sut.utf8.firstIndex(of: ._carriageReturn) +// ) +// +// // When not using scalar semantics: +// // The previous index should be the index before the whole \r\n sequence +// // (the start index) +// try _test( +// matching: .whitespace, +// against: sut, +// isScalarSemantics: false +// ) +// } func testWord() throws { // Given - try _test(matching: .word, against: "!a") - try _test(matching: .word, against: "!1") - try _test(matching: .word, against: "!_") - try _test(matching: .word, against: "a-", shouldMatch: false) + try _test(matching: .word, against: "a!") + try _test(matching: .word, against: "1!") + try _test(matching: .word, against: "_!") + try _test(matching: .word, against: "-!", shouldMatch: false) } private func _test( @@ -344,9 +335,10 @@ final class ASCIIQuickReverseMatchTests: XCTestCase { expectedPrevious: String.Index? = nil ) throws { // When - let result = sut._quickReverseMatch( + let indexOrDefault = index ?? sut.index(before: sut.endIndex) + let result = sut._quickMatchPrevious( cc, - at: index ?? sut.index(before: sut.endIndex), + at: indexOrDefault, limitedBy: sut.startIndex, isScalarSemantics: isScalarSemantics ) @@ -354,6 +346,9 @@ final class ASCIIQuickReverseMatchTests: XCTestCase { // Then let (previous, matched) = try XCTUnwrap(result) XCTAssertEqual(matched, shouldMatch) - XCTAssertEqual(previous, expectedPrevious ?? sut.startIndex) + XCTAssertEqual( + previous, + expectedPrevious ?? 
sut.index(before: indexOrDefault) + ) } } diff --git a/Tests/MatchingEngineTests/MatchingEngineTests.swift b/Tests/MatchingEngineTests/MatchingEngineTests.swift index 8785b14e8..725584103 100644 --- a/Tests/MatchingEngineTests/MatchingEngineTests.swift +++ b/Tests/MatchingEngineTests/MatchingEngineTests.swift @@ -78,15 +78,15 @@ final class StringMatchingTests: XCTestCase { // MARK: characterAndStart tests func testCharacterAndStart_HappyPath() throws { // Given - let sut = "foo" + let sut = "bar" let pos = sut.index(before: sut.endIndex) // When - let result = sut.characterAndStart(at: pos, limitedBy: sut.startIndex) + let result = sut.character(before: pos, limitedBy: sut.startIndex) // Then let (char, previousIndex) = try XCTUnwrap(result) - XCTAssertEqual(char, "o") + XCTAssertEqual(char, "a") XCTAssertEqual(previousIndex, sut.index(before: pos)) } @@ -135,7 +135,7 @@ final class StringMatchingTests: XCTestCase { let sut = "foo" // When - let result = sut.characterAndStart(at: sut.startIndex, limitedBy: sut.startIndex) + let result = sut.character(before: sut.startIndex, limitedBy: sut.startIndex) // Then XCTAssertNil(result) @@ -195,7 +195,7 @@ final class StringMatchingTests: XCTestCase { XCTAssertNil(result) } - func testReverseMatchAnyNonNewline() throws { + func testMatchPreviousAnyNonNewline() throws { // Given // A string without any newline characters let sut = "bar" @@ -203,7 +203,7 @@ final class StringMatchingTests: XCTestCase { let pos = sut.index(before: sut.endIndex) // When we run the reverse match: - let result = sut.reverseMatchAnyNonNewline( + let result = sut.matchPreviousAnyNonNewline( at: pos, limitedBy: sut.startIndex, isScalarSemantics: true @@ -215,15 +215,15 @@ final class StringMatchingTests: XCTestCase { XCTAssertEqual(sut[previousIndex], "a") } - func testReverseMatchAnyNonNewline_Newline() throws { + func testMatchPreviousAnyNonNewline_Newline() throws { // Given // A string that has a newline character, let sut = "ba\nr" - // and 
the index of that newline character - let pos = try XCTUnwrap(sut.firstIndex(of: "\n")) + // and the index of the character after that newline + let pos = sut.index(sut.startIndex, offsetBy: 3) // When we run the reverse match: - let result = sut.reverseMatchAnyNonNewline( + let result = sut.matchPreviousAnyNonNewline( at: pos, limitedBy: sut.startIndex, isScalarSemantics: true @@ -233,13 +233,13 @@ final class StringMatchingTests: XCTestCase { XCTAssertNil(result) } - func testReverseMatchAnyNonNewline_atStart() throws { + func testMatchPreviousAnyNonNewline_atStart() throws { // Given // A string without any newline characters let sut = "bar" // When we try to reverse match starting at `startIndex`: - let result = sut.reverseMatchAnyNonNewline( + let result = sut.matchPreviousAnyNonNewline( at: sut.startIndex, limitedBy: sut.startIndex, isScalarSemantics: true @@ -396,31 +396,31 @@ extension StringMatchingTests { // MARK: reverseMatchScalar tests extension StringMatchingTests { - func testReverseMatchScalar() { + func testMatchPreviousScalar() { // Given let sut = "bar" // When - let previous = sut.reverseMatchScalar( + let previous = sut.matchPreviousScalar( "a", - at: sut.index(after: sut.startIndex), + at: sut.index(before: sut.endIndex), limitedBy: sut.startIndex, boundaryCheck: false, isCaseInsensitive: false ) // Then - XCTAssertEqual(previous, sut.startIndex) + XCTAssertEqual(previous, sut.index(after: sut.startIndex)) } - func testReverseMatchScalarNoMatch() { + func testMatchPreviousScalarNoMatch() { // Given let sut = "bar" // When - let previous = sut.reverseMatchScalar( + let previous = sut.matchPreviousScalar( "b", - at: sut.index(after: sut.startIndex), + at: sut.index(before: sut.endIndex), limitedBy: sut.startIndex, boundaryCheck: false, isCaseInsensitive: false @@ -430,31 +430,31 @@ extension StringMatchingTests { XCTAssertNil(previous) } - func testReverseMatchScalarCaseInsensitive() { + func testMatchPreviousScalarCaseInsensitive() { // Given 
let sut = "BAR" // When - let previous = sut.reverseMatchScalar( + let previous = sut.matchPreviousScalar( "a", - at: sut.index(after: sut.startIndex), + at: sut.index(before: sut.endIndex), limitedBy: sut.startIndex, boundaryCheck: false, isCaseInsensitive: true ) // Then - XCTAssertEqual(previous, sut.startIndex) + XCTAssertEqual(previous, sut.index(after: sut.startIndex)) } - func testReverseMatchScalarCaseInsensitiveNoMatch() { + func testMatchPreviousScalarCaseInsensitiveNoMatch() { // Given let sut = "BAR" // When - let previous = sut.reverseMatchScalar( + let previous = sut.matchPreviousScalar( "b", - at: sut.index(after: sut.startIndex), + at: sut.index(before: sut.endIndex), limitedBy: sut.startIndex, boundaryCheck: false, isCaseInsensitive: true @@ -464,12 +464,12 @@ extension StringMatchingTests { XCTAssertNil(previous) } - func testReverseMatchScalarAtStart() { + func testMatchPreviousScalarAtStart() { // Given let sut = "a" // When - let previous = sut.reverseMatchScalar( + let previous = sut.matchPreviousScalar( "a", at: sut.startIndex, limitedBy: sut.startIndex, @@ -482,13 +482,13 @@ extension StringMatchingTests { } // TODO: JH - Write test for when the boundary check passes/check if that's already covered - func testReverseMatchScalarFailsBoundaryCheck() { + func testMatchPreviousScalarFailsBoundaryCheck() { // Given // \u{61}\u{62}\u{300}\u{316}\u{63}\u{64} let sut = "ab̖̀cd" // When - let previous = sut.reverseMatchScalar( + let previous = sut.matchPreviousScalar( "\u{316}", at: sut.unicodeScalars.index(sut.unicodeScalars.startIndex, offsetBy: 3), limitedBy: sut.startIndex, @@ -500,14 +500,14 @@ extension StringMatchingTests { XCTAssertNil(previous) } - func testReverseMatchScalarNoBoundaryCheck() { + func testMatchPreviousScalarNoBoundaryCheck() { // Given // \u{61}\u{62}\u{300}\u{316}\u{63}\u{64} let sut = "ab̖̀cd" - let startPos = sut.unicodeScalars.index(sut.unicodeScalars.startIndex, offsetBy: 3) + let startPos = 
sut.unicodeScalars.index(sut.unicodeScalars.startIndex, offsetBy: 4) // When - let previous = sut.reverseMatchScalar( + let previous = sut.matchPreviousScalar( "\u{316}", at: startPos, limitedBy: sut.startIndex, @@ -611,11 +611,11 @@ extension StringMatchingTests { // MARK: reverseMatchUTF8 tests extension StringMatchingTests { - func testReverseMatchUTF8() { + func testMatchPreviousUTF8() { // Given let sut = "quotedliteral" let needle = Array(sut.suffix(3).utf8) - + // When let previous = sut.reverseMatchUTF8( needle, @@ -623,16 +623,16 @@ extension StringMatchingTests { limitedBy: sut.startIndex, boundaryCheck: false ) - + // Then XCTAssertEqual(previous, sut.index(sut.endIndex, offsetBy: -4)) } - - func testReverseMatchUTF8NoMatch() { + + func testMatchPreviousUTF8NoMatch() { // Given let haystack = "quotedliteral" let needle = Array("\(haystack.suffix(2))a".utf8) - + // When let previous = haystack.reverseMatchUTF8( needle, @@ -640,16 +640,16 @@ extension StringMatchingTests { limitedBy: haystack.startIndex, boundaryCheck: false ) - + // Then XCTAssertNil(previous) } - - func testReverseMatchUTF8MatchPastStart() { + + func testMatchPreviousUTF8MatchPastStart() { // Given let haystack = "quotedliteral" let needle = Array(haystack.suffix(3).utf8) - + // When let previous = haystack.reverseMatchUTF8( needle, @@ -657,18 +657,18 @@ extension StringMatchingTests { limitedBy: haystack.index(haystack.unicodeScalars.endIndex, offsetBy: -2), boundaryCheck: false ) - + // Then XCTAssertNil(previous) } - + // TODO: JH - Write test for when the boundary check passes/check if that's already covered - func testReverseMatchUTF8FailsBoundaryCheck() { + func testMatchPreviousUTF8FailsBoundaryCheck() { // Given // \u{61}\u{62}\u{300}\u{316}\u{63}\u{64} let sut = "ab̖̀cd" let needle = Array("\u{316}".utf8) - + // When let previous = sut.reverseMatchUTF8( needle, @@ -676,12 +676,12 @@ extension StringMatchingTests { limitedBy: sut.startIndex, boundaryCheck: true ) - + // Then 
XCTAssertNil(previous) } - - func testReverseMatchUTF8NoBoundaryCheck() throws { + + func testMatchPreviousUTF8NoBoundaryCheck() throws { // Given // \u{61}\u{62}\u{300}\u{316}\u{63}\u{64} // utf8 = [97, 98, 204, 128, 204, 150, 99, 100] @@ -690,7 +690,7 @@ extension StringMatchingTests { let needle = Array("\u{316}".utf8) // Position of \u{316} = 5[utf8] let startPos = sut.utf8.index(sut.utf8.endIndex, offsetBy: -3) - + // When let previous = sut.reverseMatchUTF8( needle, @@ -698,7 +698,7 @@ extension StringMatchingTests { limitedBy: sut.startIndex, boundaryCheck: false ) - + // Then // TODO: JH - Is there a better way to write this assertion? // Previous should be the second byte of \u{300} diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index ae739dd7f..5774cf028 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1614,8 +1614,8 @@ extension RegexTests { #"(*positive_lookbehind:USD)\d+"#, input: "Price: USD100", match: "100") - firstMatchTest( - #"\d{3}(?<=USD\d{3})"#, input: "Price: USD100", match: "100") +// firstMatchTest( +// #"\d{3}(?<=USD\d{3})"#, input: "Price: USD100", match: "100") firstMatchTest( #"(? 
Date: Tue, 1 Jul 2025 20:56:37 -0500 Subject: [PATCH 8/8] Fix assertions in lookbehinds --- .../_StringProcessing/Engine/MEBuiltins.swift | 4 +++- .../Engine/MEReverseQuantify.swift | 6 +++--- Sources/_StringProcessing/Engine/Processor.swift | 11 ++++++----- Sources/_StringProcessing/Regex/DSLTree.swift | 4 ++++ Sources/_StringProcessing/Unicode/ASCII.swift | 12 ++++++------ Tests/RegexTests/MatchTests.swift | 16 ++++++++-------- 6 files changed, 30 insertions(+), 23 deletions(-) diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index e28a33fe8..d7a38aa85 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -36,7 +36,7 @@ extension Processor { isStrictASCII: Bool, isScalarSemantics: Bool ) -> Bool { - guard currentPosition >= start, let previous = input.matchPreviousBuiltinCC( + guard let previous = input.matchPreviousBuiltinCC( cc, at: currentPosition, limitedBy: start, @@ -182,6 +182,7 @@ extension String { : (substr.first!, substr.endIndex) } + // TODO: JH - Fix this docu /// Returns the character before `pos`, bounded by `start`, as well as that /// character's index. /// @@ -215,6 +216,7 @@ extension String { return (self[previous], previous) } + // TODO: JH - Verify this works as expected // `start` must be a sub-character position that is between `pos` and the // next grapheme boundary. 
This is okay if `start` is on a Unicode scalar // boundary, but if it's in the middle of a scalar's code units, there diff --git a/Sources/_StringProcessing/Engine/MEReverseQuantify.swift b/Sources/_StringProcessing/Engine/MEReverseQuantify.swift index d8dde890d..5e27f58a0 100644 --- a/Sources/_StringProcessing/Engine/MEReverseQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEReverseQuantify.swift @@ -232,7 +232,7 @@ extension String { produceSavePointRange: produceSavePointRange, isScalarSemantics: isScalarSemantics ) { currentPosition, start, isScalarSemantics in - reverseMatchASCIIBitset( + matchPreviousASCIIBitset( asciiBitset, at: currentPosition, limitedBy: start, @@ -254,7 +254,7 @@ extension String { produceSavePointRange: produceSavePointRange, isScalarSemantics: isScalarSemantics ) { currentPosition, start, isScalarSemantics in - reverseMatchASCIIBitset( + matchPreviousASCIIBitset( asciiBitset, at: currentPosition, limitedBy: start, @@ -279,7 +279,7 @@ extension String { produceSavePointRange: produceSavePointRange, isScalarSemantics: isScalarSemantics ) { currentPosition, start, isScalarSemantics in - reverseMatchASCIIBitset( + matchPreviousASCIIBitset( asciiBitset, at: currentPosition, limitedBy: start, diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 80f067f4b..6b3203737 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -474,7 +474,7 @@ extension Processor { _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, isScalarSemantics: Bool ) -> Bool { - guard let previous = input.reverseMatchASCIIBitset( + guard let previous = input.matchPreviousASCIIBitset( bitset, at: currentPosition, limitedBy: start, @@ -1120,7 +1120,7 @@ extension String { return next } - func reverseMatchASCIIBitset( + func matchPreviousASCIIBitset( _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, at pos: Index, limitedBy start: Index, @@ 
-1141,9 +1141,10 @@ extension String { limitedBy: start ) else { if isScalarSemantics { - guard pos >= start else { return nil } - guard bitset.matches(unicodeScalars[pos]) else { return nil } - return unicodeScalars.index(before: pos) + guard pos > start else { return nil } + let matchPos = unicodeScalars.index(before: pos) + guard bitset.matches(unicodeScalars[matchPos]) else { return nil } + return matchPos } else { guard let prev = character(before: pos, limitedBy: start), bitset.matches(prev.char) else { return nil } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 6b20d5e17..d350b6b8c 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -791,6 +791,10 @@ extension DSLTree.Node { // Groups (and other parent nodes) defer to the child. case .nonCapturingGroup(let kind, let child): + // FIXME: JH - There are lookbehinds that we can definitively tell can only match at the start. Figure that out and implement it. Ex: (?<=^)abc while silly, is an example of this. 
There may be others + guard kind.ast != .lookbehind, kind.ast != .negativeLookbehind else { + return false + } options.beginScope() defer { options.endScope() } if case .changeMatchingOptions(let sequence) = kind.ast { diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift index 3e6816a3e..d2c5cb56b 100644 --- a/Sources/_StringProcessing/Unicode/ASCII.swift +++ b/Sources/_StringProcessing/Unicode/ASCII.swift @@ -139,31 +139,31 @@ extension String { var previous = utf8.index(before: idx) // The character we want to return - let char = utf8[previous] - guard char._isASCII else { + let previousChar = utf8[previous] + guard previousChar._isASCII else { assert(!self[previous].isASCII) return nil } if previous == start { // We've hit the start so there's no need to check for CR-LF - return (char: char, index: previous, crLF: false) + return (char: previousChar, index: previous, crLF: false) } let head = utf8[utf8.index(before: previous)] guard head._isSub300StartingByte else { return nil } // Handle CR-LF by reversing past the sequence if both characters are present - if char == ._lineFeed && head == ._carriageReturn { + if previousChar == ._lineFeed && head == ._carriageReturn { utf8.formIndex(before: &previous) guard previous == start || utf8[previous]._isSub300StartingByte else { return nil } - return (char: char, index: previous, crLF: true) + return (char: previousChar, index: previous, crLF: true) } assert(self[previous].isASCII && self[previous] != "\r\n") - return (char: char, index: previous, crLF: false) + return (char: previousChar, index: previous, crLF: false) } func _quickMatch( diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 5774cf028..1325cffd8 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1643,14 +1643,14 @@ extension RegexTests { match: "suffix" ) -// firstMatchTests( -// #"(?<=^\d{1,3})abc"#, -// ("123abc", "abc"), 
-// ("12abc", "abc"), -// ("1abc", "abc"), -// ("1234abc", nil), // FIXME: Shouldn't match but does because `^` assertions are broken -// ("z123abc", nil) // FIXME: Same as above -// ) + firstMatchTests( + #"(?<=^\d{1,3})abc"#, + ("123abc", "abc"), + ("12abc", "abc"), + ("1abc", "abc"), + ("1234abc", nil), + ("z123abc", nil) + ) // firstMatchTest(#"abcd(?<=c(?=d)d)"#, input: "abcdefg", match: "abcd") // firstMatchTest(#"abcd(?<=cd(?=d).)"#, input: "abcdefg", match: nil)