diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index 4ec7a71dd..b50d1c213 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -30,6 +30,7 @@ extension Processor { } func isAtStartOfLine(_ payload: AssertionPayload) -> Bool { + // TODO: needs benchmark coverage if currentPosition == subjectBounds.lowerBound { return true } switch payload.semanticLevel { case .graphemeCluster: @@ -40,6 +41,7 @@ extension Processor { } func isAtEndOfLine(_ payload: AssertionPayload) -> Bool { + // TODO: needs benchmark coverage if currentPosition == subjectBounds.upperBound { return true } switch payload.semanticLevel { case .graphemeCluster: @@ -50,6 +52,8 @@ extension Processor { } mutating func builtinAssert(by payload: AssertionPayload) throws -> Bool { + // TODO: needs benchmark coverage + // Future work: Optimize layout and dispatch switch payload.kind { case .startOfSubject: return currentPosition == subjectBounds.lowerBound diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index 0e2f3923a..81b80b00e 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -3,10 +3,14 @@ extension Processor { var next: Input.Index? switch payload.type { case .bitset: - next = _doMatchBitset(registers[payload.bitset]) + next = input.matchBitset( + registers[payload.bitset], at: currentPosition, limitedBy: end) case .asciiChar: - next = _doMatchScalar( - UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true) + next = input.matchScalar( + UnicodeScalar.init(_value: UInt32(payload.asciiChar)), + at: currentPosition, + limitedBy: end, + boundaryCheck: true) case .builtin: // We only emit .quantify if it consumes a single character next = input._matchBuiltinCC( @@ -16,6 +20,7 @@ extension Processor { isStrictASCII: payload.builtinIsStrict, isScalarSemantics: false) case .any: + // TODO: call out to existing code with quick check let matched = currentPosition != input.endIndex && (!input[currentPosition].isNewline || payload.anyMatchesNewline) next = matched ? input.index(after: currentPosition) : nil diff --git a/Sources/_StringProcessing/Engine/Metrics.swift b/Sources/_StringProcessing/Engine/Metrics.swift index 753c3c3d1..372a7e1b4 100644 --- a/Sources/_StringProcessing/Engine/Metrics.swift +++ b/Sources/_StringProcessing/Engine/Metrics.swift @@ -1,13 +1,71 @@ extension Processor { +#if PROCESSOR_MEASUREMENTS_ENABLED struct ProcessorMetrics { var instructionCounts: [Instruction.OpCode: Int] = [:] var backtracks: Int = 0 var resets: Int = 0 + var cycleCount: Int = 0 + + var isTracingEnabled: Bool = false + var shouldMeasureMetrics: Bool = false + + init(isTracingEnabled: Bool, shouldMeasureMetrics: Bool) { + self.isTracingEnabled = isTracingEnabled + self.shouldMeasureMetrics = shouldMeasureMetrics + } } - +#else + struct ProcessorMetrics { + var isTracingEnabled: Bool { false } + var shouldMeasureMetrics: Bool { false } + var cycleCount: Int { 0 } + + init(isTracingEnabled: Bool, shouldMeasureMetrics: Bool) { } + } +#endif +} + +extension Processor { + + mutating func startCycleMetrics() { +#if PROCESSOR_MEASUREMENTS_ENABLED + if metrics.cycleCount == 0 { + trace() + measureMetrics() + } +#endif + } + + mutating func endCycleMetrics() { +#if PROCESSOR_MEASUREMENTS_ENABLED + metrics.cycleCount += 1 + trace() + measureMetrics() + _checkInvariants() +#endif + } +} + +extension Processor.ProcessorMetrics { + + mutating func addReset() { +#if PROCESSOR_MEASUREMENTS_ENABLED + self.resets += 1 +#endif + } + + mutating func addBacktrack() { +#if PROCESSOR_MEASUREMENTS_ENABLED + self.backtracks += 1 +#endif + } +} + +extension Processor { +#if PROCESSOR_MEASUREMENTS_ENABLED func printMetrics() { print("===") - print("Total cycle count: \(cycleCount)") + print("Total cycle count: \(metrics.cycleCount)") print("Backtracks: \(metrics.backtracks)") print("Resets: \(metrics.resets)") print("Instructions:") @@ -21,7 +79,7 @@ extension Processor { } mutating func measure() { - let (opcode, _) = fetch().destructure + let (opcode, _) = fetch() if metrics.instructionCounts.keys.contains(opcode) { metrics.instructionCounts[opcode]! += 1 } else { @@ -30,8 +88,9 @@ extension Processor { } mutating func measureMetrics() { - if shouldMeasureMetrics { + if metrics.shouldMeasureMetrics { measure() } } +#endif } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 98aee7803..3ff767207 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -86,11 +86,7 @@ struct Processor { var failureReason: Error? = nil - // MARK: Metrics, debugging, etc. - var cycleCount = 0 - var isTracingEnabled: Bool - let shouldMeasureMetrics: Bool - var metrics: ProcessorMetrics = ProcessorMetrics() + var metrics: ProcessorMetrics } extension Processor { @@ -116,8 +112,11 @@ extension Processor { self.subjectBounds = subjectBounds self.searchBounds = searchBounds self.matchMode = matchMode - self.isTracingEnabled = isTracingEnabled - self.shouldMeasureMetrics = shouldMeasureMetrics + + self.metrics = ProcessorMetrics( + isTracingEnabled: isTracingEnabled, + shouldMeasureMetrics: shouldMeasureMetrics) + self.currentPosition = searchBounds.lowerBound // Initialize registers with end of search bounds @@ -144,8 +143,8 @@ extension Processor { self.state = .inProgress self.failureReason = nil - - if shouldMeasureMetrics { metrics.resets += 1 } + + metrics.addReset() _checkInvariants() } @@ -160,6 +159,10 @@ extension Processor { } extension Processor { + func fetch() -> (Instruction.OpCode, Instruction.Payload) { + instructions[controller.pc].destructure + } + var slice: Input.SubSequence { // TODO: Should we whole-scale switch to slices, or // does that depend on options for some anchors? @@ -177,6 +180,7 @@ extension Processor { // Returns whether the advance succeeded. On failure, our // save point was restored mutating func consume(_ n: Distance) -> Bool { + // TODO: need benchmark coverage guard let idx = input.index( currentPosition, offsetBy: n.rawValue, limitedBy: end ) else { @@ -189,6 +193,7 @@ extension Processor { // Advances in unicode scalar view mutating func consumeScalar(_ n: Distance) -> Bool { + // TODO: need benchmark coverage guard let idx = input.unicodeScalars.index( currentPosition, offsetBy: n.rawValue, limitedBy: end ) else { @@ -220,24 +225,22 @@ extension Processor { func load() -> Element? { currentPosition < end ? input[currentPosition] : nil } - func load(count: Int) -> Input.SubSequence? { - let slice = self.slice[currentPosition...].prefix(count) - guard slice.count == count else { return nil } - return slice - } // Match against the current input element. Returns whether // it succeeded vs signaling an error. mutating func match(_ e: Element) -> Bool { - guard let cur = load(), cur == e else { + guard let next = input.match( + e, at: currentPosition, limitedBy: end + ) else { signalFailure() return false } - _uncheckedForcedConsumeOne() + currentPosition = next return true } mutating func matchCaseInsensitive(_ e: Element) -> Bool { + // TODO: need benchmark coverage guard let cur = load(), cur.lowercased() == e.lowercased() else { signalFailure() return false @@ -253,15 +256,21 @@ extension Processor { isScalarMode: Bool ) -> Bool { if isScalarMode { + // TODO: needs benchmark coverage for s in seq.unicodeScalars { guard matchScalar(s, boundaryCheck: false) else { return false } } return true } - for e in seq { - guard match(e) else { return false } + guard let next = input.matchSeq( + seq, at: currentPosition, limitedBy: end + ) else { + signalFailure() + return false } + + currentPosition = next return true } @@ -269,21 +278,10 @@ extension Processor { currentPosition < end ? input.unicodeScalars[currentPosition] : nil } - func _doMatchScalar(_ s: Unicode.Scalar, _ boundaryCheck: Bool) -> Input.Index? { - if s == loadScalar(), - let idx = input.unicodeScalars.index( - currentPosition, - offsetBy: 1, - limitedBy: end), - (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) { - return idx - } else { - return nil - } - } - mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool { - guard let next = _doMatchScalar(s, boundaryCheck) else { + guard let next = input.matchScalar( + s, at: currentPosition, limitedBy: end, boundaryCheck: boundaryCheck + ) else { signalFailure() return false } @@ -295,6 +293,7 @@ extension Processor { _ s: Unicode.Scalar, boundaryCheck: Bool ) -> Bool { + // TODO: needs benchmark coverage guard let curScalar = loadScalar(), s.properties.lowercaseMapping == curScalar.properties.lowercaseMapping, let idx = input.unicodeScalars.index( @@ -310,21 +309,15 @@ extension Processor { return true } - func _doMatchBitset(_ bitset: DSLTree.CustomCharacterClass.AsciiBitset) -> Input.Index? { - if let cur = load(), bitset.matches(char: cur) { - return input.index(after: currentPosition) - } else { - return nil - } - } - // If we have a bitset we know that the CharacterClass only matches against // ascii characters, so check if the current input element is ascii then // check if it is set in the bitset mutating func matchBitset( _ bitset: DSLTree.CustomCharacterClass.AsciiBitset ) -> Bool { - guard let next = _doMatchBitset(bitset) else { + guard let next = input.matchBitset( + bitset, at: currentPosition, limitedBy: end + ) else { signalFailure() return false } @@ -336,6 +329,7 @@ extension Processor { mutating func matchBitsetScalar( _ bitset: DSLTree.CustomCharacterClass.AsciiBitset ) -> Bool { + // TODO: needs benchmark coverage guard let curScalar = loadScalar(), bitset.matches(scalar: curScalar), let idx = input.unicodeScalars.index(currentPosition, offsetBy: 1, limitedBy: end) else { @@ -396,8 +390,8 @@ extension Processor { storedCaptures = capEnds registers.ints = intRegisters registers.positions = posRegisters - - if shouldMeasureMetrics { metrics.backtracks += 1 } + + metrics.addBacktrack() } mutating func abort(_ e: Error? = nil) { @@ -436,20 +430,10 @@ extension Processor { _checkInvariants() assert(state == .inProgress) -#if PROCESSOR_MEASUREMENTS_ENABLED - if cycleCount == 0 { - trace() - measureMetrics() - } - defer { - cycleCount += 1 - trace() - measureMetrics() - _checkInvariants() - } -#endif + startCycleMetrics() + defer { endCycleMetrics() } - let (opcode, payload) = fetch().destructure + let (opcode, payload) = fetch() switch opcode { case .invalid: fatalError("Invalid program") @@ -703,3 +687,86 @@ extension Processor { } } } + +extension String { + + func match( + _ char: Character, + at pos: Index, + limitedBy end: String.Index + ) -> Index? { + // TODO: This can be greatly sped up with string internals + // TODO: This is also very much quick-check-able + assert(end <= endIndex) + + guard pos < end, self[pos] == char else { return nil } + + let idx = index(after: pos) + guard idx <= end else { return nil } + + return idx + } + + func matchSeq( + _ seq: Substring, + at pos: Index, + limitedBy end: Index + ) -> Index? { + // TODO: This can be greatly sped up with string internals + // TODO: This is also very much quick-check-able + assert(end <= endIndex) + + var cur = pos + for e in seq { + guard cur < end, self[cur] == e else { return nil } + self.formIndex(after: &cur) + } + + guard cur <= end else { return nil } + return cur + } + + func matchScalar( + _ scalar: Unicode.Scalar, + at pos: Index, + limitedBy end: String.Index, + boundaryCheck: Bool + ) -> Index? { + assert(end <= endIndex) + + guard pos < end, unicodeScalars[pos] == scalar else { + return nil + } + + let idx = unicodeScalars.index(after: pos) + guard idx <= end else { return nil } + + if boundaryCheck && !isOnGraphemeClusterBoundary(idx) { + return nil + } + + return idx + } + + func matchBitset( + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, + at pos: Index, + limitedBy end: Index + ) -> Index? { + // TODO: extremely quick-check-able + // TODO: can be sped up with string internals + + assert(end <= endIndex) + + guard pos < end, bitset.matches(char: self[pos]) else { + return nil + } + + let idx = index(after: pos) + guard idx <= end else { return nil } + + return idx + } + + +} diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index 725319b00..b0ce67555 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ b/Sources/_StringProcessing/Engine/Tracing.swift @@ -9,7 +9,12 @@ // //===----------------------------------------------------------------------===// + +// TODO: Remove this protocol (and/or reuse it for something like a FastProcessor) extension Processor: TracedProcessor { + var cycleCount: Int { metrics.cycleCount } + var isTracingEnabled: Bool { metrics.isTracingEnabled } + var isFailState: Bool { state == .fail } var isAcceptState: Bool { state == .accept } diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 253858d1f..0453fcd80 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -31,7 +31,7 @@ struct Executor { subjectBounds: subjectBounds, searchBounds: searchBounds) #if PROCESSOR_MEASUREMENTS_ENABLED - defer { if cpu.shouldMeasureMetrics { cpu.printMetrics() } } + defer { if cpu.metrics.shouldMeasureMetrics { cpu.printMetrics() } } #endif var low = searchBounds.lowerBound let high = searchBounds.upperBound @@ -60,7 +60,7 @@ struct Executor { var cpu = engine.makeProcessor( input: input, bounds: subjectBounds, matchMode: mode) #if PROCESSOR_MEASUREMENTS_ENABLED - defer { if cpu.shouldMeasureMetrics { cpu.printMetrics() } } + defer { if cpu.metrics.shouldMeasureMetrics { cpu.printMetrics() } } #endif return try _match(input, from: subjectBounds.lowerBound, using: &cpu) } diff --git a/Sources/_StringProcessing/Unicode/WordBreaking.swift b/Sources/_StringProcessing/Unicode/WordBreaking.swift index 50da079f6..f1f9573c1 100644 --- a/Sources/_StringProcessing/Unicode/WordBreaking.swift +++ b/Sources/_StringProcessing/Unicode/WordBreaking.swift @@ -12,6 +12,7 @@ @_spi(_Unicode) import Swift +// TODO: Sink onto String extension Processor { func atSimpleBoundary( _ usesAsciiWord: Bool, @@ -20,9 +21,11 @@ extension Processor { func matchesWord(at i: Input.Index) -> Bool { switch semanticLevel { case .graphemeCluster: + // TODO: needs benchmark coverage let c = input[i] return c.isWordCharacter && (c.isASCII || !usesAsciiWord) case .unicodeScalar: + // TODO: needs benchmark coverage let c = input.unicodeScalars[i] return (c.properties.isAlphabetic || c == "_") && (c.isASCII || !usesAsciiWord) } @@ -51,6 +54,7 @@ extension String { using cache: inout Set?, _ maxIndex: inout String.Index? ) -> Bool { + // TODO: needs benchmark coverage guard i != startIndex, i != endIndex else { return true } diff --git a/Sources/_StringProcessing/Utility/Protocols.swift b/Sources/_StringProcessing/Utility/Protocols.swift index 7542a17dd..24ffbcf70 100644 --- a/Sources/_StringProcessing/Utility/Protocols.swift +++ b/Sources/_StringProcessing/Utility/Protocols.swift @@ -44,13 +44,3 @@ protocol ProcessorProtocol { var registers: Registers { get } } -extension ProcessorProtocol { - func fetch() -> Instruction { - instructions[currentPC] - } - - var callStack: Array { [] } -// var savePoints: Array { [] } - var registers: Array { [] } - -} diff --git a/Sources/_StringProcessing/Utility/Traced.swift b/Sources/_StringProcessing/Utility/Traced.swift index 112a601b1..198564fe1 100644 --- a/Sources/_StringProcessing/Utility/Traced.swift +++ b/Sources/_StringProcessing/Utility/Traced.swift @@ -13,7 +13,7 @@ // TODO: Place shared formatting and trace infrastructure here protocol Traced { - var isTracingEnabled: Bool { get set } + var isTracingEnabled: Bool { get } } protocol TracedProcessor: ProcessorProtocol, Traced {