From 60b4d78ac3bd5d148bb339520500020fabe2bcb1 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 13 Apr 2023 16:38:15 -0500 Subject: [PATCH 1/4] Remove most consumer functions This is based heavily off the work in #590, rebased onto main, with some changes to remove even more consumer uses. Consumer functions only have two remaining uses: non-ASCII ranges and Unicode lookups (for things like general category, binary properties, name, etc.). This change primarily treats custom character classes as alternations around their contents, with set operations emitted as instructions instead of implemented via consumer function. --- Sources/_StringProcessing/ByteCodeGen.swift | 296 +++++++++++++++++- .../_StringProcessing/ConsumerInterface.swift | 267 +--------------- .../Engine/Instruction.swift | 8 + .../_StringProcessing/Engine/MEBuilder.swift | 8 + .../_StringProcessing/Engine/Processor.swift | 4 + Sources/_StringProcessing/Regex/DSLTree.swift | 28 ++ Tests/RegexTests/CompileTests.swift | 41 +++ 7 files changed, 378 insertions(+), 274 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d4c91bd63..9f2940f6c 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -243,9 +243,11 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitAlternation( - _ children: [DSLTree.Node] - ) throws { + mutating func emitAlternationGen( + _ elements: C, + withBacktracking: Bool, + _ body: (inout Compiler.ByteCodeGen, C.Element) throws -> Void + ) rethrows { // Alternation: p0 | p1 | ... | pn // save next_p1 // @@ -263,16 +265,27 @@ fileprivate extension Compiler.ByteCodeGen { // // done: let done = builder.makeAddress() - for component in children.dropLast() { + for element in elements.dropLast() { let next = builder.makeAddress() builder.buildSave(next) - try emitNode(component) + try body(&self, element) + if !withBacktracking { + builder.buildClear() + } builder.buildBranch(to: done) builder.label(next) } - try emitNode(children.last!) + try body(&self, elements.last!) builder.label(done) } + + mutating func emitAlternation( + _ children: [DSLTree.Node] + ) throws { + try emitAlternationGen(children, withBacktracking: true) { + try $0.emitNode($1) + } + } mutating func emitConcatenationComponent( _ node: DSLTree.Node @@ -828,6 +841,36 @@ fileprivate extension Compiler.ByteCodeGen { } } + /// Flatten quoted strings into sequences of atoms, so that the standard + /// CCC codegen will handle them. + func flatteningCustomCharacterClassMembers( + _ members: [DSLTree.CustomCharacterClass.Member] + ) -> [DSLTree.CustomCharacterClass.Member] { + var characters: Set = [] + var scalars: Set = [] + var result: [DSLTree.CustomCharacterClass.Member] = [] + for member in members { + switch member { + case .atom(let atom): + switch atom { + case let .char(char): + characters.insert(char) + case let .scalar(scalar): + scalars.insert(scalar) + default: + result.append(member) + } + case let .quotedLiteral(str): + characters.formUnion(str) + default: + result.append(member) + } + } + result.append(contentsOf: characters.map { .atom(.char($0)) }) + result.append(contentsOf: scalars.map { .atom(.scalar($0)) }) + return result + } + func coalescingCustomCharacterClass( _ ccc: DSLTree.CustomCharacterClass ) -> DSLTree.CustomCharacterClass { @@ -835,12 +878,150 @@ fileprivate extension Compiler.ByteCodeGen { // mode, we don't want to coalesce any scalars into a grapheme. This // means that e.g `[e\u{301}-\u{302}]` remains a range between U+301 and // U+302. - guard options.semanticLevel == .graphemeCluster else { return ccc } - - let members = coalescingCustomCharacterClassMembers(ccc.members) - return .init(members: members, isInverted: ccc.isInverted) + let members = options.semanticLevel == .graphemeCluster + ? coalescingCustomCharacterClassMembers(ccc.members) + : ccc.members + return .init( + members: flatteningCustomCharacterClassMembers(members), + isInverted: ccc.isInverted) } + mutating func emitCharacterInCCC(_ c: Character) { + switch options.semanticLevel { + case .graphemeCluster: + emitCharacter(c) + case .unicodeScalar: + // When in scalar mode, act like an alternation of the individual scalars + // that comprise a character. + emitAlternationGen(c.unicodeScalars, withBacktracking: false) { + $0.emitMatchScalar($1) + } + } + } + + mutating func emitCCCMember( + _ member: DSLTree.CustomCharacterClass.Member + ) throws { + switch member { + case .atom(let atom): + switch atom { + case .char(let c): + emitCharacterInCCC(c) + case .scalar(let s): + emitCharacterInCCC(Character(s)) + default: + try emitAtom(atom) + } + case .custom(let ccc): + try emitCustomCharacterClass(ccc) + case .quotedLiteral: + fatalError("Removed in 'flatteningCustomCharacterClassMembers'") + case .range: + let consumer = try member.generateConsumer(options) + builder.buildConsume(by: consumer) + case .trivia: + return + + // TODO: Can we decide when it's better to try `rhs` first? + // Intersection is trivial, since failure on either side propagates: + // - store current position + // - lhs + // - restore current position + // - rhs + case let .intersection(lhs, rhs): + let r = builder.makePositionRegister() + builder.buildMoveCurrentPosition(into: r) + try emitCustomCharacterClass(lhs) + builder.buildRestorePosition(from: r) + try emitCustomCharacterClass(rhs) + + // TODO: Can we decide when it's better to try `rhs` first? + // For subtraction, failure in `lhs` propagates, while failure in `rhs` is + // swallowed/reversed: + // - store current position + // - lhs + // - save to end + // - restore current position + // - rhs + // - clear, fail (since both succeeded) + // - end: ... + case let .subtraction(lhs, rhs): + let r = builder.makePositionRegister() + let end = builder.makeAddress() + builder.buildMoveCurrentPosition(into: r) + try emitCustomCharacterClass(lhs) // no match here = failure, propagates + builder.buildSave(end) + builder.buildRestorePosition(from: r) + try emitCustomCharacterClass(rhs) // no match here = success, resumes at 'end' + builder.buildClear() // clears 'end' + builder.buildFail() // this failure propagates outward + builder.label(end) + + // Symmetric difference always requires executing both `rhs` and `lhs`. + // Execute each, ignoring failure and storing the resulting position in a + // register. If those results are equal, fail. If they're different, use + // the position that is different from the starting position: + // - store current position as r0 + // - save to lhsFail + // - lhs + // - clear lhsFail (and continue) + // - lhsFail: save position as r1 + // + // - restore current position + // - save to rhsFail + // - rhs + // - clear rhsFail (and continue) + // - rhsFail: save position as r2 + // + // - restore to resulting position from lhs (r1) + // - if equal to r2, goto fail (both sides had same result) + // - if equal to r0, goto advance (lhs failed) + // - goto end + // - advance: restore to resulting position from rhs (r2) + // - goto end + // - fail: fail + // - end: ... + case let .symmetricDifference(lhs, rhs): + let r0 = builder.makePositionRegister() + let r1 = builder.makePositionRegister() + let r2 = builder.makePositionRegister() + let lhsFail = builder.makeAddress() + let rhsFail = builder.makeAddress() + let advance = builder.makeAddress() + let fail = builder.makeAddress() + let end = builder.makeAddress() + + builder.buildMoveCurrentPosition(into: r0) + builder.buildSave(lhsFail) + try emitCustomCharacterClass(lhs) + builder.buildClear() + builder.label(lhsFail) + builder.buildMoveCurrentPosition(into: r1) + + builder.buildRestorePosition(from: r0) + builder.buildSave(rhsFail) + try emitCustomCharacterClass(rhs) + builder.buildClear() + builder.label(rhsFail) + builder.buildMoveCurrentPosition(into: r2) + + // If r1 == r2, then fail + builder.buildRestorePosition(from: r1) + builder.buildCondBranch(to: fail, ifSamePositionAs: r2) + + // If r1 == r0, then move to r2 before ending + builder.buildCondBranch(to: advance, ifSamePositionAs: r0) + builder.buildBranch(to: end) + builder.label(advance) + builder.buildRestorePosition(from: r2) + builder.buildBranch(to: end) + + builder.label(fail) + builder.buildFail() + builder.label(end) + } + } + mutating func emitCustomCharacterClass( _ ccc: DSLTree.CustomCharacterClass ) throws { @@ -858,8 +1039,67 @@ fileprivate extension Compiler.ByteCodeGen { } return } - let consumer = try ccc.generateConsumer(options) - builder.buildConsume(by: consumer) + + let updatedCCC: DSLTree.CustomCharacterClass + if optimizationsEnabled { + updatedCCC = ccc.coalescingASCIIMembers(options) + } else { + updatedCCC = ccc + } + let filteredMembers = updatedCCC.members.filter({!$0.isOnlyTrivia}) + + if updatedCCC.isInverted { + // inverted + // custom character class: p0 | p1 | ... | pn + // Try each member to make sure they all fail + // save next_p1 + // + // clear, fail + // next_p1: + // save next_p2 + // + // clear fail + // next_p2: + // save next_p... + // + // clear fail + // ... + // next_pn: + // save done + // + // clear fail + // done: + // step forward by 1 + let done = builder.makeAddress() + for member in filteredMembers.dropLast() { + let next = builder.makeAddress() + builder.buildSave(next) + try emitCCCMember(member) + builder.buildClear() + builder.buildFail() + builder.label(next) + } + builder.buildSave(done) + try emitCCCMember(filteredMembers.last!) + builder.buildClear() + builder.buildFail() + builder.label(done) + + // Consume a single unit for the inverted ccc + switch options.semanticLevel { + case .graphemeCluster: + builder.buildAdvance(1) + case .unicodeScalar: + builder.buildAdvanceUnicodeScalar(1) + } + return + } + // non inverted CCC + // Custom character class: p0 | p1 | ... | pn + // Very similar to alternation, but we don't keep backtracking save points + try emitAlternationGen(filteredMembers, withBacktracking: false) { + try $0.emitCCCMember($1) + } } mutating func emitConcatenation(_ children: [DSLTree.Node]) throws { @@ -996,6 +1236,12 @@ fileprivate extension Compiler.ByteCodeGen { } extension DSLTree.Node { + /// A Boolean value indicating whether this node advances the match position + /// on a successful match. + /// + /// For example, an alternation like `(a|b|c)` always advances the position + /// by a character, but `(a|b|)` has an empty branch, which matches without + /// advancing. var guaranteesForwardProgress: Bool { switch self { case .orderedChoice(let children): @@ -1026,8 +1272,8 @@ extension DSLTree.Node { case .consumer, .matcher: // Allow zero width consumers and matchers return false - case .customCharacterClass: - return true + case .customCharacterClass(let ccc): + return ccc.guaranteesForwardProgress case .quantification(let amount, _, let child): let (atLeast, _) = amount.ast.bounds return atLeast ?? 0 > 0 && child.guaranteesForwardProgress @@ -1035,3 +1281,25 @@ extension DSLTree.Node { } } } + +extension DSLTree.CustomCharacterClass { + /// We allow trivia into CustomCharacterClass, which could result in a CCC + /// that matches nothing, ie `(?x)[ ]`. + var guaranteesForwardProgress: Bool { + for m in members { + switch m { + case .trivia: + continue + case let .intersection(lhs, rhs): + return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress + case let .subtraction(lhs, _): + return lhs.guaranteesForwardProgress + case let .symmetricDifference(lhs, rhs): + return lhs.guaranteesForwardProgress && rhs.guaranteesForwardProgress + default: + return true + } + } + return false + } +} diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 705b354fb..fb4074b52 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -18,87 +18,12 @@ extension Character { } } -extension DSLTree.Node { - /// Attempt to generate a consumer from this AST node - /// - /// A consumer is a Swift closure that matches against - /// the front of an input range - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction? { - switch self { - case .atom(let a): - return try a.generateConsumer(opts) - case .customCharacterClass(let ccc): - return try ccc.generateConsumer(opts) - - case .quotedLiteral: - // TODO: Should we handle this here? - return nil - - case let .convertedRegexLiteral(n, _): - return try n.generateConsumer(opts) - - case .orderedChoice, .conditional, .concatenation, - .capture, .nonCapturingGroup, - .quantification, .trivia, .empty, - .ignoreCapturesInTypedOutput, .absentFunction: return nil - - case .consumer: - fatalError("FIXME: Is this where we handle them?") - case .matcher: - fatalError("FIXME: Is this where we handle them?") - case .characterPredicate: - fatalError("FIXME: Is this where we handle them?") - } - } -} - extension DSLTree._AST.Atom { var singleScalarASCIIValue: UInt8? { return ast.singleScalarASCIIValue } } -extension Character { - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction { - let isCaseInsensitive = opts.isCaseInsensitive - switch opts.semanticLevel { - case .graphemeCluster: - return { input, bounds in - let low = bounds.lowerBound - if isCaseInsensitive && isCased { - return input[low].lowercased() == lowercased() - ? input.index(after: low) - : nil - } else { - return input[low] == self - ? input.index(after: low) - : nil - } - } - case .unicodeScalar: - // TODO: This should only be reachable from character class emission, can - // we guarantee that? Otherwise we'd want a different matching behavior. - let consumers = unicodeScalars.map { s in consumeScalar { - isCaseInsensitive - ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping - : $0 == s - }} - return { input, bounds in - for fn in consumers { - if let idx = fn(input, bounds) { - return idx - } - } - return nil - } - } - } -} - extension DSLTree.Atom { var singleScalarASCIIValue: UInt8? { switch self { @@ -112,85 +37,6 @@ extension DSLTree.Atom { return nil } } - - // TODO: If ByteCodeGen switches first, then this is unnecessary for - // top-level nodes, but it's also invoked for `.atom` members of a custom CC - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction? { - switch self { - case let .char(c): - return try c.generateConsumer(opts) - - case let .scalar(s): - // A scalar always matches the same as a single scalar character. This - // means it must match a whole grapheme in grapheme semantic mode, but - // can match a single scalar in scalar semantic mode. - return try Character(s).generateConsumer(opts) - - case .any: - // FIXME: Should this be a total ordering? - if opts.semanticLevel == .graphemeCluster { - return { input, bounds in - input.index(after: bounds.lowerBound) - } - } else { - return consumeScalar { _ in - true - } - } - - case .anyNonNewline: - switch opts.semanticLevel { - case .graphemeCluster: - return { input, bounds in - input[bounds.lowerBound].isNewline - ? nil - : input.index(after: bounds.lowerBound) - } - case .unicodeScalar: - return { input, bounds in - input[bounds.lowerBound].isNewline - ? nil - : input.unicodeScalars.index(after: bounds.lowerBound) - } - } - - case .dot: - throw Unreachable(".atom(.dot) should be handled by emitDot") - - case .assertion: - // TODO: We could handle, should this be total? - return nil - case .characterClass(let cc): - return cc.generateConsumer(opts) - - case .backreference: - // TODO: Should we handle? - return nil - - case .symbolicReference: - // TODO: Should we handle? - return nil - - case .changeMatchingOptions: - // TODO: Should we handle? - return nil - - case let .unconverted(a): - return try a.ast.generateConsumer(opts) - } - - } -} - -extension DSLTree.Atom.CharacterClass { - func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction { - let model = asRuntimeModel(opts) - return { input, bounds in - model.matches(in: input, at: bounds.lowerBound) - } - } } extension String { @@ -281,47 +127,20 @@ extension AST.Atom { _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { switch kind { - case let .scalar(s): - assertionFailure( - "Should have been handled by tree conversion") - return consumeScalar { $0 == s.value } - - case let .char(c): - assertionFailure( - "Should have been handled by tree conversion") - - // TODO: Match level? - return { input, bounds in - let low = bounds.lowerBound - guard input[low] == c else { - return nil - } - return input.index(after: low) - } - case let .property(p): return try p.generateConsumer(opts) case let .namedCharacter(name): return consumeName(name, opts: opts) - case .dot: - assertionFailure( - "Should have been handled by tree conversion") - fatalError(".atom(.dot) is handled in emitDot") - - case .caretAnchor, .dollarAnchor: - // handled in emitAssertion - return nil - case .escaped: - // handled in emitAssertion and emitCharacterClass - return nil - case .scalarSequence, .keyboardControl, .keyboardMeta, .keyboardMetaControl, .backreference, .subpattern, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: // FIXME: implement return nil + + default: + fatalError("Handled in ByteCodeGen or earlier.") } } } @@ -360,11 +179,6 @@ extension DSLTree.CustomCharacterClass.Member { _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction { switch self { - case let .atom(a): - guard let c = try a.generateConsumer(opts) else { - throw Unsupported("Consumer for \(a)") - } - return c case let .range(low, high): guard let lhsChar = low.literalCharacterValue else { throw Unsupported("\(low) in range") @@ -430,57 +244,12 @@ extension DSLTree.CustomCharacterClass.Member { return nil } - case let .custom(ccc): - return try ccc.generateConsumer(opts) - - case let .intersection(lhs, rhs): - let lhs = try lhs.generateConsumer(opts) - let rhs = try rhs.generateConsumer(opts) - return { input, bounds in - if let lhsIdx = lhs(input, bounds), - let rhsIdx = rhs(input, bounds) - { - guard lhsIdx == rhsIdx else { - fatalError("TODO: What should we do here?") - } - return lhsIdx - } - return nil - } + case .atom, .custom, .intersection, .subtraction, .symmetricDifference: + fatalError("Handled in 'emitCustomCharacterClass'") - case let .subtraction(lhs, rhs): - let lhs = try lhs.generateConsumer(opts) - let rhs = try rhs.generateConsumer(opts) - return { input, bounds in - if let lhsIdx = lhs(input, bounds), - rhs(input, bounds) == nil - { - return lhsIdx - } - return nil - } + case .quotedLiteral: + fatalError("Removed in 'flatteningCustomCharacterClassMembers'") - case let .symmetricDifference(lhs, rhs): - let lhs = try lhs.generateConsumer(opts) - let rhs = try rhs.generateConsumer(opts) - return { input, bounds in - if let lhsIdx = lhs(input, bounds) { - return rhs(input, bounds) == nil ? lhsIdx : nil - } - return rhs(input, bounds) - } - case .quotedLiteral(let str): - let consumers = try str.map { - try $0.generateConsumer(opts) - } - return { input, bounds in - for fn in consumers { - if let idx = fn(input, bounds) { - return idx - } - } - return nil - } case .trivia: // TODO: Should probably strip this earlier... return { _, _ in nil } @@ -501,28 +270,6 @@ extension DSLTree.CustomCharacterClass { } ) } - - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction { - // NOTE: Easy way to implement, obviously not performant - let consumers = try members.map { - try $0.generateConsumer(opts) - } - return { input, bounds in - for consumer in consumers { - if let idx = consumer(input, bounds) { - return isInverted ? nil : idx - } - } - if isInverted { - return opts.semanticLevel == .graphemeCluster - ? input.index(after: bounds.lowerBound) - : input.unicodeScalars.index(after: bounds.lowerBound) - } - return nil - } - } } // NOTE: Conveniences, though not most performant diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 21ab90a03..011174660 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -44,6 +44,14 @@ extension Instruction { /// Operands: /// - Position register to move into case moveCurrentPosition + + /// Set the current position to the value stored in the register + /// + /// restorePosition(from: PositionRegister) + /// + /// Operands: + /// - Position register to read from + case restorePosition // MARK: General Purpose: Control flow diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 4b623fbda..f28e3c77e 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -81,6 +81,10 @@ extension MEProgram.Builder { .init(instructions.endIndex - 1) } + mutating func buildFatalError() { + instructions.append(.init(.invalid)) + } + mutating func buildMoveImmediate( _ value: UInt64, into: IntRegister ) { @@ -308,6 +312,10 @@ extension MEProgram.Builder { instructions.append(.init(.moveCurrentPosition, .init(position: r))) } + mutating func buildRestorePosition(from r: PositionRegister) { + instructions.append(.init(.restorePosition, .init(position: r))) + } + mutating func buildBackreference( _ cap: CaptureRegister, isScalarMode: Bool diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 98aee7803..b60d28d92 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -465,6 +465,10 @@ extension Processor { let reg = payload.position registers[reg] = currentPosition controller.step() + case .restorePosition: + let reg = payload.position + currentPosition = registers[reg] + controller.step() case .branch: controller.pc = payload.addr diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index b784e2382..cdd1a6738 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -131,6 +131,23 @@ extension DSLTree { } } + func coalescingASCIIMembers(_ opts: MatchingOptions) -> CustomCharacterClass { + var ascii: [Member] = [] + var nonAscii: [Member] = [] + for member in members { + if member.asAsciiBitset(opts, false) != nil { + ascii.append(member) + } else { + nonAscii.append(member) + } + } + if ascii.isEmpty || nonAscii.isEmpty { return self } + return CustomCharacterClass(members: [ + .custom(CustomCharacterClass(members: ascii)), + .custom(CustomCharacterClass(members: nonAscii)) + ], isInverted: isInverted) + } + public init(members: [DSLTree.CustomCharacterClass.Member], isInverted: Bool = false) { self.members = members self.isInverted = isInverted @@ -161,6 +178,17 @@ extension DSLTree { indirect case intersection(CustomCharacterClass, CustomCharacterClass) indirect case subtraction(CustomCharacterClass, CustomCharacterClass) indirect case symmetricDifference(CustomCharacterClass, CustomCharacterClass) + + var isOnlyTrivia: Bool { + switch self { + case .custom(let ccc): + return ccc.members.all(\.isOnlyTrivia) + case .trivia: + return true + default: + return false + } + } } } diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 752921e19..b44cace41 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -19,6 +19,7 @@ enum DecodedInstr { case invalid case moveImmediate case moveCurrentPosition + case restorePosition case branch case condBranchZeroElseDecrement case condBranchSamePosition @@ -65,6 +66,8 @@ extension DecodedInstr { return .moveImmediate case .moveCurrentPosition: return .moveCurrentPosition + case .restorePosition: + return .restorePosition case .branch: return .branch case .condBranchZeroElseDecrement: @@ -368,6 +371,44 @@ extension RegexTests { semanticLevel: .unicodeScalar, contains: [.matchBitsetScalar], doesNotContain: [.matchBitset, .consumeBy]) + expectProgram( + for: "[a-c]", + contains: [.matchBitset], + doesNotContain: [.consumeBy, .matchBitsetScalar]) + expectProgram( + for: "[a-á]", + contains: [.consumeBy], + doesNotContain: [.matchBitset, .matchBitsetScalar]) + expectProgram( + for: "[a-fá-ém-zk]", + contains: [.matchBitset, .consumeBy], + doesNotContain: [.matchBitsetScalar]) + expectProgram( + for: "[a-c0123]", + contains: [.matchBitset], + doesNotContain: [.matchBitsetScalar, .consumeBy]) + expectProgram( + for: #"\w"#, + contains: [.matchBuiltin], + doesNotContain: [.consumeBy, .matchBitset, .matchBitsetScalar]) + expectProgram( + for: #"[\w]"#, + contains: [.matchBuiltin], + doesNotContain: [.consumeBy, .matchBitset, .matchBitsetScalar]) + expectProgram( + for: #"\w"#, + semanticLevel: .unicodeScalar, + contains: [.matchBuiltin], + doesNotContain: [.consumeBy, .matchBitset, .matchBitsetScalar]) + expectProgram( + for: #"[\w]"#, + semanticLevel: .unicodeScalar, + contains: [.matchBuiltin], + doesNotContain: [.consumeBy, .matchBitset, .matchBitsetScalar]) + expectProgram( + for: #"\p{Greek}"#, + contains: [.consumeBy], + doesNotContain: [.matchBuiltin, .matchBitset, .matchBitsetScalar]) } func testScalarOptimizeCompilation() { From ded0d1b93af1119c0ae3ce18710269efced874a3 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 13 Apr 2023 22:34:16 -0500 Subject: [PATCH 2/4] Require new stdlib for range tests --- Tests/RegexTests/CompileTests.swift | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index b44cace41..b90f8dad2 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -375,14 +375,6 @@ extension RegexTests { for: "[a-c]", contains: [.matchBitset], doesNotContain: [.consumeBy, .matchBitsetScalar]) - expectProgram( - for: "[a-á]", - contains: [.consumeBy], - doesNotContain: [.matchBitset, .matchBitsetScalar]) - expectProgram( - for: "[a-fá-ém-zk]", - contains: [.matchBitset, .consumeBy], - doesNotContain: [.matchBitsetScalar]) expectProgram( for: "[a-c0123]", contains: [.matchBitset], @@ -409,6 +401,18 @@ extension RegexTests { for: #"\p{Greek}"#, contains: [.consumeBy], doesNotContain: [.matchBuiltin, .matchBitset, .matchBitsetScalar]) + + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + + expectProgram( + for: "[a-á]", + contains: [.consumeBy], + doesNotContain: [.matchBitset, .matchBitsetScalar]) + expectProgram( + for: "[a-fá-ém-zk]", + contains: [.matchBitset, .consumeBy], + doesNotContain: [.matchBitsetScalar]) } func testScalarOptimizeCompilation() { From 4b382cc636d4bf4e451e6bcce7b1464048660e27 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 11 Mar 2024 22:48:37 -0500 Subject: [PATCH 3/4] Handle advancing with possibile sub-character end. When searching in a substring with an endIndex that isn't on a character boundary, advancing from the penultimate character position to the substring's endIndex can end up failing, since advancing by a character finds the character-aligned index that is _past_ the substring end. This change handles that case. # Conflicts: # Sources/_StringProcessing/ConsumerInterface.swift --- .../_StringProcessing/Engine/Processor.swift | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 02d19f84c..ff2b9037f 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -175,14 +175,25 @@ extension Processor { // save point was restored mutating func consume(_ n: Distance) -> Bool { // TODO: needs benchmark coverage - guard let idx = input.index( + if let idx = input.index( currentPosition, offsetBy: n.rawValue, limitedBy: end - ) else { - signalFailure() - return false + ) { + currentPosition = idx + return true } - currentPosition = idx - return true + + // If `end` falls in the middle of a character, and we are trying to advance + // by one "character", then we should max out at `end` even though the above + // advancement will result in `nil`. + if n == 1, let idx = input.unicodeScalars.index( + currentPosition, offsetBy: n.rawValue, limitedBy: end + ) { + currentPosition = idx + return true + } + + signalFailure() + return false } // Advances in unicode scalar view From 55340fce3451c594122e36187ce585f96baaa4f3 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 26 Mar 2024 19:11:14 -0500 Subject: [PATCH 4/4] Specific cases when switching to generate consumer --- Sources/_StringProcessing/ConsumerInterface.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 74841817e..bba659ebb 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -139,7 +139,7 @@ extension AST.Atom { // FIXME: implement return nil - default: + case .char, .scalar, .escaped, .dot, .caretAnchor, .dollarAnchor: fatalError("Handled in ByteCodeGen or earlier.") #if RESILIENT_LIBRARIES