From 8c151e0286ba35e04d60cf71a572c0a47287f708 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Sun, 19 Jun 2022 10:08:45 -0600 Subject: [PATCH 1/2] Avoid double execution by avoiding Array init --- .../_StringProcessing/Algorithms/Matching/Matches.swift | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Sources/_StringProcessing/Algorithms/Matching/Matches.swift b/Sources/_StringProcessing/Algorithms/Matching/Matches.swift index 094d3dfdd..08b25bcd5 100644 --- a/Sources/_StringProcessing/Algorithms/Matching/Matches.swift +++ b/Sources/_StringProcessing/Algorithms/Matching/Matches.swift @@ -349,6 +349,12 @@ extension BidirectionalCollection where SubSequence == Substring { public func matches<Output>( of r: some RegexComponent<Output> ) -> [Regex<Output>.Match] { - Array(_matches(of: r)) + // FIXME: Array init calls count, which double-executes the regex :-( + // FIXME: just return some Collection<Regex<Output>.Match> + var result = Array<Regex<Output>.Match>() + for match in _matches(of: r) { + result.append(match) + } + return result } } From 8c9ca19707f0a5410b4ee42c55af1a10bdd08f55 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Sun, 19 Jun 2022 10:38:21 -0600 Subject: [PATCH 2/2] De-genericize processor, engine, etc. Provides only modest performance improvements (it was already getting specialized), but makes it possible to add String-specific specializations. 
--- Sources/_StringProcessing/ByteCodeGen.swift | 4 +-- Sources/_StringProcessing/Compiler.swift | 2 +- .../_StringProcessing/ConsumerInterface.swift | 34 +++++++++---------- .../_StringProcessing/Engine/Consume.swift | 6 ++-- Sources/_StringProcessing/Engine/Engine.swift | 6 ++-- .../_StringProcessing/Engine/MEBuilder.swift | 10 +++--- .../_StringProcessing/Engine/MECapture.swift | 2 +- .../_StringProcessing/Engine/MEProgram.swift | 6 ++-- .../_StringProcessing/Engine/Processor.swift | 9 ++--- .../_StringProcessing/Engine/Registers.swift | 22 ++++++------ .../_StringProcessing/Engine/Tracing.swift | 2 +- Sources/_StringProcessing/Executor.swift | 6 ++-- Sources/_StringProcessing/Regex/Core.swift | 6 ++-- Tests/Prototypes/PEG/PEGTranspile.swift | 2 +- 14 files changed, 59 insertions(+), 58 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index bcfc8a2c2..cff0df57e 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -3,7 +3,7 @@ extension Compiler { struct ByteCodeGen { var options: MatchingOptions - var builder = Program.Builder() + var builder = MEProgram.Builder() /// A Boolean indicating whether the first matchable atom has been emitted. /// This is used to determine whether to apply initial options. var hasEmittedFirstMatchableAtom = false @@ -16,7 +16,7 @@ extension Compiler { } extension Compiler.ByteCodeGen { - mutating func emitRoot(_ root: DSLTree.Node) throws -> Program { + mutating func emitRoot(_ root: DSLTree.Node) throws -> MEProgram { // The whole match (`.0` element of output) is equivalent to an implicit // capture over the entire regex. 
try emitNode(.capture(name: nil, reference: nil, root)) diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 601cd52a4..8961a1b88 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -25,7 +25,7 @@ class Compiler { self.tree = tree } - __consuming func emit() throws -> Program { + __consuming func emit() throws -> MEProgram { // TODO: Handle global options var codegen = ByteCodeGen( options: options, captureList: tree.captureList diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index a912fd136..07757eb6a 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -18,7 +18,7 @@ extension DSLTree.Node { /// the front of an input range func generateConsumer( _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction? { + ) throws -> MEProgram.ConsumeFunction? { switch self { case .atom(let a): return try a.generateConsumer(opts) @@ -56,7 +56,7 @@ extension DSLTree.Atom { // top-level nodes, but it's also invoked for `.atom` members of a custom CC func generateConsumer( _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction? { + ) throws -> MEProgram.ConsumeFunction? { let isCaseInsensitive = opts.isCaseInsensitive switch self { @@ -142,7 +142,7 @@ extension String { } } -func consumeName(_ name: String, opts: MatchingOptions) -> MEProgram.ConsumeFunction { +func consumeName(_ name: String, opts: MatchingOptions) -> MEProgram.ConsumeFunction { let consume = consumeFunction(for: opts) return consume(propertyScalarPredicate { // FIXME: name aliases not covered by $0.nameAlias are missed @@ -180,7 +180,7 @@ extension AST.Atom { func generateConsumer( _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction? { + ) throws -> MEProgram.ConsumeFunction? { // TODO: Wean ourselves off of this type... 
if let cc = self.characterClass?.withMatchLevel( opts.matchLevel @@ -237,7 +237,7 @@ extension AST.Atom { extension DSLTree.CustomCharacterClass.Member { func generateConsumer( _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction { + ) throws -> MEProgram.ConsumeFunction { switch self { case let .atom(a): guard let c = try a.generateConsumer(opts) else { @@ -344,7 +344,7 @@ extension DSLTree.CustomCharacterClass.Member { extension DSLTree.CustomCharacterClass { func generateConsumer( _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction { + ) throws -> MEProgram.ConsumeFunction { // NOTE: Easy way to implement, obviously not performant let consumers = try members.map { try $0.generateConsumer(opts) @@ -386,7 +386,7 @@ private func propertyScalarPredicate(_ p: @escaping (Unicode.Scalar.Properties) func consumeScalar( _ p: @escaping ScalarPredicate -) -> MEProgram.ConsumeFunction { +) -> MEProgram.ConsumeFunction { { input, bounds in // TODO: bounds check? let curIdx = bounds.lowerBound @@ -399,7 +399,7 @@ func consumeScalar( } func consumeCharacterWithLeadingScalar( _ p: @escaping ScalarPredicate -) -> MEProgram.ConsumeFunction { +) -> MEProgram.ConsumeFunction { { input, bounds in let curIdx = bounds.lowerBound if p(input[curIdx].unicodeScalars.first!) { @@ -410,7 +410,7 @@ func consumeCharacterWithLeadingScalar( } func consumeCharacterWithSingleScalar( _ p: @escaping ScalarPredicate -) -> MEProgram.ConsumeFunction { +) -> MEProgram.ConsumeFunction { { input, bounds in let curIdx = bounds.lowerBound @@ -423,7 +423,7 @@ func consumeCharacterWithSingleScalar( func consumeFunction( for opts: MatchingOptions -) -> (@escaping ScalarPredicate) -> MEProgram.ConsumeFunction { +) -> (@escaping ScalarPredicate) -> MEProgram.ConsumeFunction { opts.semanticLevel == .graphemeCluster ? 
consumeCharacterWithLeadingScalar : consumeScalar @@ -432,11 +432,11 @@ func consumeFunction( extension AST.Atom.CharacterProperty { func generateConsumer( _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction { + ) throws -> MEProgram.ConsumeFunction { // Handle inversion for us, albeit not efficiently func invert( - _ p: @escaping MEProgram.ConsumeFunction - ) -> MEProgram.ConsumeFunction { + _ p: @escaping MEProgram.ConsumeFunction + ) -> MEProgram.ConsumeFunction { return { input, bounds in if p(input, bounds) != nil { return nil } @@ -448,7 +448,7 @@ extension AST.Atom.CharacterProperty { } let consume = consumeFunction(for: opts) - let preInversion: MEProgram.ConsumeFunction = + let preInversion: MEProgram.ConsumeFunction = try { switch kind { // TODO: is this modeled differently? @@ -533,7 +533,7 @@ extension Unicode.BinaryProperty { // FIXME: Semantic level, vet for precise defs func generateConsumer( _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction { + ) throws -> MEProgram.ConsumeFunction { let consume = consumeFunction(for: opts) // Note if you implement support for any of the below, you need to adjust @@ -701,7 +701,7 @@ extension Unicode.POSIXProperty { // FIXME: Semantic level, vet for precise defs func generateConsumer( _ opts: MatchingOptions - ) -> MEProgram.ConsumeFunction { + ) -> MEProgram.ConsumeFunction { let consume = consumeFunction(for: opts) // FIXME: modes, etc @@ -749,7 +749,7 @@ extension Unicode.ExtendedGeneralCategory { // FIXME: Semantic level func generateConsumer( _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction { + ) throws -> MEProgram.ConsumeFunction { let consume = consumeFunction(for: opts) switch self { diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift index bc60ba260..3d3524d80 100644 --- a/Sources/_StringProcessing/Engine/Consume.swift +++ b/Sources/_StringProcessing/Engine/Consume.swift @@ -13,8 +13,8 @@ var checkComments = 
true extension Engine { func makeProcessor( - input: Input, bounds: Range, matchMode: MatchMode - ) -> Processor { + input: String, bounds: Range, matchMode: MatchMode + ) -> Processor { Processor( program: program, input: input, @@ -24,7 +24,7 @@ extension Engine { } } -extension Processor where Input == String { +extension Processor { // TODO: Should we throw here? mutating func consume() -> Input.Index? { while true { diff --git a/Sources/_StringProcessing/Engine/Engine.swift b/Sources/_StringProcessing/Engine/Engine.swift index 86952c8b7..9e67e4639 100644 --- a/Sources/_StringProcessing/Engine/Engine.swift +++ b/Sources/_StringProcessing/Engine/Engine.swift @@ -11,9 +11,9 @@ // Currently, engine binds the type and consume binds an instance. // But, we can play around with this. -struct Engine where Input.Element: Hashable { +struct Engine { - var program: MEProgram + var program: MEProgram // TODO: Pre-allocated register banks @@ -25,7 +25,7 @@ struct Engine where Input.Element: Hashable { } init( - _ program: MEProgram, + _ program: MEProgram, enableTracing: Bool? = nil ) { var program = program diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 13b2d3798..f998a4952 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -11,7 +11,7 @@ @_implementationOnly import _RegexParser // For errors -extension MEProgram where Input.Element: Hashable { +extension MEProgram { struct Builder { var instructions: [Instruction] = [] @@ -71,7 +71,7 @@ extension MEProgram.Builder { // TODO: We want a better strategy for fixups, leaving // the operand in a different form isn't great... 
- init(staticElements: S) where S.Element == Input.Element { + init(staticElements: S) where S.Element == Character { staticElements.forEach { elements.store($0) } } @@ -183,14 +183,14 @@ extension MEProgram.Builder { instructions.append(.init(.advance, .init(distance: n))) } - mutating func buildMatch(_ e: Input.Element) { + mutating func buildMatch(_ e: Character) { instructions.append(.init( .match, .init(element: elements.store(e)))) } mutating func buildMatchSequence( _ s: S - ) where S.Element == Input.Element { + ) where S.Element == Character { instructions.append(.init( .matchSequence, .init(sequence: sequences.store(.init(s))))) @@ -219,7 +219,7 @@ extension MEProgram.Builder { } mutating func buildAssert( - _ e: Input.Element, into cond: BoolRegister + _ e: Character, into cond: BoolRegister ) { instructions.append(.init(.assertion, .init( element: elements.store(e), bool: cond))) diff --git a/Sources/_StringProcessing/Engine/MECapture.swift b/Sources/_StringProcessing/Engine/MECapture.swift index ec7c3668a..53243cd34 100644 --- a/Sources/_StringProcessing/Engine/MECapture.swift +++ b/Sources/_StringProcessing/Engine/MECapture.swift @@ -95,7 +95,7 @@ extension Processor._StoredCapture: CustomStringConvertible { } struct MECaptureList { - var values: Array<Processor<String>._StoredCapture> + var values: Array<Processor._StoredCapture> var referencedCaptureOffsets: [ReferenceID: Int] func latestUntyped(from input: String) -> Array { diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index 52aef1511..dd166e554 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -11,12 +11,14 @@ @_implementationOnly import _RegexParser -struct MEProgram where Input.Element: Equatable { +struct MEProgram { + typealias Input = String + typealias ConsumeFunction = (Input, Range) -> Input.Index? 
typealias AssertionFunction = (Input, Input.Index, Range) throws -> Bool typealias TransformFunction = - (Input, Processor._StoredCapture) throws -> Any? + (Input, Processor._StoredCapture) throws -> Any? typealias MatcherFunction = (Input, Input.Index, Range) throws -> (Input.Index, Any)? diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index a81d2ce06..1717e485d 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -14,8 +14,6 @@ enum MatchMode { case partialFromFront } -typealias Program = MEProgram - /// A concrete CU. Somehow will run the concrete logic and /// feed stuff back to generic code struct Controller { @@ -26,9 +24,8 @@ struct Controller { } } -struct Processor< - Input: BidirectionalCollection -> where Input.Element: Equatable { // maybe Hashable? +struct Processor { + typealias Input = String typealias Element = Input.Element let input: Input @@ -75,7 +72,7 @@ extension Processor { extension Processor { init( - program: MEProgram, + program: MEProgram, input: Input, bounds: Range, matchMode: MatchMode, diff --git a/Sources/_StringProcessing/Engine/Registers.swift b/Sources/_StringProcessing/Engine/Registers.swift index e6f823341..37ab1cdee 100644 --- a/Sources/_StringProcessing/Engine/Registers.swift +++ b/Sources/_StringProcessing/Engine/Registers.swift @@ -29,15 +29,15 @@ extension Processor { // TODO: Degenericize Processor and store Strings var sequences: [[Element]] = [] - var consumeFunctions: [MEProgram.ConsumeFunction] + var consumeFunctions: [MEProgram.ConsumeFunction] - var assertionFunctions: [MEProgram.AssertionFunction] + var assertionFunctions: [MEProgram.AssertionFunction] // Captured-value constructors - var transformFunctions: [MEProgram.TransformFunction] + var transformFunctions: [MEProgram.TransformFunction] // Value-constructing matchers - var matcherFunctions: [MEProgram.MatcherFunction] + var 
matcherFunctions: [MEProgram.MatcherFunction] // currently, these are for comments and abort messages var strings: [String] @@ -58,6 +58,8 @@ extension Processor { } extension Processor.Registers { + typealias Input = String + subscript(_ i: StringRegister) -> String { strings[i.rawValue] } @@ -85,24 +87,24 @@ extension Processor.Registers { subscript(_ i: ElementRegister) -> Input.Element { elements[i.rawValue] } - subscript(_ i: ConsumeFunctionRegister) -> MEProgram.ConsumeFunction { + subscript(_ i: ConsumeFunctionRegister) -> MEProgram.ConsumeFunction { consumeFunctions[i.rawValue] } - subscript(_ i: AssertionFunctionRegister) -> MEProgram.AssertionFunction { + subscript(_ i: AssertionFunctionRegister) -> MEProgram.AssertionFunction { assertionFunctions[i.rawValue] } - subscript(_ i: TransformRegister) -> MEProgram.TransformFunction { + subscript(_ i: TransformRegister) -> MEProgram.TransformFunction { transformFunctions[i.rawValue] } - subscript(_ i: MatcherRegister) -> MEProgram.MatcherFunction { + subscript(_ i: MatcherRegister) -> MEProgram.MatcherFunction { matcherFunctions[i.rawValue] } } extension Processor.Registers { init( - _ program: MEProgram, - _ sentinel: Input.Index + _ program: MEProgram, + _ sentinel: String.Index ) { let info = program.registerInfo diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index 24d00d3d7..525beec63 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ b/Sources/_StringProcessing/Engine/Tracing.swift @@ -54,7 +54,7 @@ extension Instruction.Payload: CustomStringConvertible { } extension Processor.SavePoint { - func describe(in input: Input) -> String { + func describe(in input: String) -> String { let posStr: String if let p = self.pos { posStr = "\(input.distance(from: input.startIndex, to: p))" diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 295a732de..f8d10001e 100644 --- 
a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -13,9 +13,9 @@ struct Executor { // TODO: consider let, for now lets us toggle tracing - var engine: Engine + var engine: Engine - init(program: Program, enablesTracing: Bool = false) { + init(program: MEProgram, enablesTracing: Bool = false) { self.engine = Engine(program, enableTracing: enablesTracing) } @@ -61,7 +61,7 @@ struct Executor { func _match( _ input: String, in inputRange: Range, - using cpu: inout Processor + using cpu: inout Processor ) throws -> Regex.Match? { guard let endIdx = cpu.consume() else { if let e = cpu.failureReason { diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 882d9069d..b96ccda58 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -81,15 +81,15 @@ extension Regex { let tree: DSLTree private final class ProgramBox { - let value: MEProgram - init(_ value: MEProgram) { self.value = value } + let value: MEProgram + init(_ value: MEProgram) { self.value = value } } /// Do not use directly - all accesses must go through `loweredProgram`. private var _loweredProgramStorage: AnyObject? = nil /// The program for execution with the matching engine. - var loweredProgram: MEProgram { + var loweredProgram: MEProgram { if let loweredObject = _loweredProgramStorage as? ProgramBox { return loweredObject.value } diff --git a/Tests/Prototypes/PEG/PEGTranspile.swift b/Tests/Prototypes/PEG/PEGTranspile.swift index 84e220d52..91867bdb8 100644 --- a/Tests/Prototypes/PEG/PEGTranspile.swift +++ b/Tests/Prototypes/PEG/PEGTranspile.swift @@ -12,7 +12,7 @@ @testable import _StringProcessing extension PEG.VM where Input == String { - typealias MEProg = MEProgram + typealias MEProg = MEProgram func transpile() throws -> MEProg { typealias Builder = MEProg.Builder var builder = MEProg.Builder()