diff --git a/Sources/RegexBenchmark/Benchmark.swift b/Sources/RegexBenchmark/Benchmark.swift index df4168c2d..2622639fb 100644 --- a/Sources/RegexBenchmark/Benchmark.swift +++ b/Sources/RegexBenchmark/Benchmark.swift @@ -1,15 +1,45 @@ -import _StringProcessing +@_spi(RegexBenchmark) import _StringProcessing +@_implementationOnly import _RegexParser import Foundation -protocol RegexBenchmark { +protocol RegexBenchmark: Debug { var name: String { get } func run() - func debug() } -struct Benchmark: RegexBenchmark { +protocol SwiftRegexBenchmark: RegexBenchmark { + var regex: Regex { get set } + var pattern: String? { get } +} + +extension SwiftRegexBenchmark { + mutating func compile() { + let _ = regex._forceAction(.recompile) + } + mutating func parse() -> Bool { + guard let s = pattern else { + return false + } + + do { + let _ = try _RegexParser.parse(s, .traditional) + return true + } catch { + return false + } + } + mutating func enableTracing() { + let _ = regex._forceAction(.addOptions(.enableTracing)) + } + mutating func enableMetrics() { + let _ = regex._forceAction(.addOptions([.enableMetrics])) + } +} + +struct Benchmark: SwiftRegexBenchmark { let name: String - let regex: Regex + var regex: Regex + let pattern: String? let type: MatchType let target: String @@ -52,11 +82,12 @@ struct NSBenchmark: RegexBenchmark { } /// A benchmark running a regex on strings in input set -struct InputListBenchmark: RegexBenchmark { +struct InputListBenchmark: SwiftRegexBenchmark { let name: String - let regex: Regex + var regex: Regex + let pattern: String? let targets: [String] - + func run() { for target in targets { blackHole(target.wholeMatch(of: regex)) @@ -78,7 +109,7 @@ struct InputListNSBenchmark: RegexBenchmark { func range(in target: String) -> NSRange { NSRange(target.startIndex.. 
BenchmarkRunner { - var benchmark = BenchmarkRunner("RegexBench", samples, quiet) + mutating func registerDefault() { // -- start of registrations -- - benchmark.addReluctantQuant() - benchmark.addCSS() - benchmark.addNotFound() - benchmark.addGraphemeBreak() - benchmark.addHangulSyllable() - // benchmark.addHTML() // Disabled due to \b being unusably slow - benchmark.addEmail() - benchmark.addCustomCharacterClasses() - benchmark.addBuiltinCC() - benchmark.addUnicode() - benchmark.addLiteralSearch() - benchmark.addDiceNotation() - benchmark.addErrorMessages() - benchmark.addIpAddress() + self.addReluctantQuant() + self.addCSS() + self.addNotFound() + self.addGraphemeBreak() + self.addHangulSyllable() + // self.addHTML() // Disabled due to \b being unusably slow + self.addEmail() + self.addCustomCharacterClasses() + self.addBuiltinCC() + self.addUnicode() + self.addLiteralSearch() + self.addDiceNotation() + self.addErrorMessages() + self.addIpAddress() // -- end of registrations -- - return benchmark } } diff --git a/Sources/RegexBenchmark/BenchmarkResults.swift b/Sources/RegexBenchmark/BenchmarkResults.swift new file mode 100644 index 000000000..ae9c5ded2 --- /dev/null +++ b/Sources/RegexBenchmark/BenchmarkResults.swift @@ -0,0 +1,277 @@ +import Foundation + +extension BenchmarkRunner { + /// Attempts to save the results to the given path + func save(to savePath: String) throws { + let url = URL(fileURLWithPath: savePath, isDirectory: false) + let parent = url.deletingLastPathComponent() + if !FileManager.default.fileExists(atPath: parent.path) { + try! 
FileManager.default.createDirectory( + atPath: parent.path, + withIntermediateDirectories: true) + } + print("Saving result to \(url.path)") + try results.save(to: url) + } + + /// Attempts to load the results from the given save file + mutating func load(from savePath: String) throws { + let url = URL(fileURLWithPath: savePath) + let result = try SuiteResult.load(from: url) + self.results = result + print("Loaded results from \(url.path)") + } + + /// Compare this runner's results against the results stored in the given file path + func compare( + against compareFilePath: String, + showChart: Bool, + saveTo: String? + ) throws { + let compareFileURL = URL(fileURLWithPath: compareFilePath) + let compareResult = try SuiteResult.load(from: compareFileURL) + let compareFile = compareFileURL.lastPathComponent + + let comparisons = results + .compare(with: compareResult) + .filter({!$0.name.contains("_NS")}) + .filter({$0.diff != nil}) + displayComparisons( + comparisons, + showChart, + against: "saved benchmark result " + compareFile) + if let saveFile = saveTo { + try saveComparisons(comparisons, path: saveFile) + } + } + + // Compile times are often very short (5-20µs) so results are likely to be + // very affected by background tasks. 
This is primarily for making sure + // there aren't any catastrophic changes in compile times + func compareCompileTimes( + against compareFilePath: String, + showChart: Bool + ) throws { + let compareFileURL = URL(fileURLWithPath: compareFilePath) + let compareResult = try SuiteResult.load(from: compareFileURL) + let compareFile = compareFileURL.lastPathComponent + + let compileTimeComparisons = results + .compareCompileTimes(with: compareResult) + .filter({!$0.name.contains("_NS")}) + .filter({$0.diff != nil}) + print("Comparing estimated compile times") + displayComparisons( + compileTimeComparisons, + false, + against: "saved benchmark result " + compareFile) + } + + /// Compares Swift Regex benchmark results against NSRegularExpression + func compareWithNS(showChart: Bool, saveTo: String?) throws { + let comparisons = results.compareWithNS().filter({$0.diff != nil}) + displayComparisons( + comparisons, + showChart, + against: "NSRegularExpression (via CrossBenchmark)") + if let saveFile = saveTo { + try saveComparisons(comparisons, path: saveFile) + } + } + + func displayComparisons( + _ comparisons: [BenchmarkResult.Comparison], + _ showChart: Bool, + against: String + ) { + let regressions = comparisons.filter({$0.diff!.seconds > 0}) + .sorted(by: {(a,b) in a.diff!.seconds > b.diff!.seconds}) + let improvements = comparisons.filter({$0.diff!.seconds < 0}) + .sorted(by: {(a,b) in a.diff!.seconds < b.diff!.seconds}) + + print("Comparing against \(against)") + print("=== Regressions ======================================================================") + for item in regressions { + print(item) + } + + print("=== Improvements =====================================================================") + for item in improvements { + print(item) + } + + #if os(macOS) && canImport(Charts) + if showChart { + print(""" + === Comparison chart ================================================================= + Press Control-C to close... 
+ """) + BenchmarkResultApp.comparisons = comparisons + BenchmarkResultApp.main() + } + #endif + } + + func saveComparisons( + _ comparisons: [BenchmarkResult.Comparison], + path: String + ) throws { + let url = URL(fileURLWithPath: path, isDirectory: false) + let parent = url.deletingLastPathComponent() + if !FileManager.default.fileExists(atPath: parent.path) { + try! FileManager.default.createDirectory( + atPath: parent.path, + withIntermediateDirectories: true) + } + + var contents = "name,latest,baseline,diff,percentage\n" + for comparison in comparisons { + contents += comparison.asCsv + "\n" + } + print("Saving comparisons as .csv to \(path)") + try contents.write(to: url, atomically: true, encoding: String.Encoding.utf8) + } +} + +struct Measurement: Codable, CustomStringConvertible { + let median: Time + let stdev: Double + let samples: Int + + init(results: [Time]) { + let sorted = results.sorted() + self.samples = sorted.count + self.median = sorted[samples/2] + let sum = results.reduce(0.0) {acc, next in acc + next.seconds} + let mean = sum / Double(samples) + let squareDiffs = results.reduce(0.0) { acc, next in + acc + pow(next.seconds - mean, 2) + } + self.stdev = (squareDiffs / Double(samples)).squareRoot() + } + + var description: String { + return "\(median) (stdev: \(Time(stdev)), N = \(samples))" + } +} + +struct BenchmarkResult: Codable, CustomStringConvertible { + let runtime: Measurement + let compileTime: Measurement? + let parseTime: Measurement? 
+ + var description: String { + var base = " > run time: \(runtime.description)" + if let compileTime = compileTime { + base += "\n > compile time: \(compileTime)" + } + if let parseTime = parseTime { + base += "\n > parse time: \(parseTime)" + } + return base + } +} + +extension BenchmarkResult { + struct Comparison: Identifiable, CustomStringConvertible { + var id = UUID() + var name: String + var baseline: Measurement + var latest: Measurement + + var latestTime: Time { latest.median } + var baselineTime: Time { baseline.median } + var diff: Time? { + if Stats.tTest(baseline, latest) { + return latestTime - baselineTime + } + return nil + } + var normalizedDiff: Double { + latestTime.seconds/baselineTime.seconds + } + + var description: String { + guard let diff = diff else { + return "- \(name) N/A" + } + let percentage = (1000 * diff.seconds / baselineTime.seconds).rounded()/10 + let len = max(40 - name.count, 1) + let nameSpacing = String(repeating: " ", count: len) + return "- \(name)\(nameSpacing)\(latestTime)\t\(baselineTime)\t\(diff)\t\t\(percentage)%" + } + + var asCsv: String { + guard let diff = diff else { + return "\(name),N/A" + } + let percentage = (1000 * diff.seconds / baselineTime.seconds).rounded()/10 + return "\"\(name)\",\(latestTime.seconds),\(baselineTime.seconds),\(diff.seconds),\(percentage)%" + } + } +} + +struct SuiteResult { + var results: [String: BenchmarkResult] = [:] + + mutating func add(name: String, result: BenchmarkResult) { + results.updateValue(result, forKey: name) + } + + func compare(with other: SuiteResult) -> [BenchmarkResult.Comparison] { + var comparisons: [BenchmarkResult.Comparison] = [] + for latest in results { + if let otherVal = other.results[latest.key] { + comparisons.append( + .init(name: latest.key, + baseline: otherVal.runtime, latest: latest.value.runtime)) + } + } + return comparisons + } + + /// Compares with the NSRegularExpression benchmarks generated by CrossBenchmark + func compareWithNS() -> 
[BenchmarkResult.Comparison] { + var comparisons: [BenchmarkResult.Comparison] = [] + for latest in results { + let key = latest.key + CrossBenchmark.nsSuffix + if let nsResult = results[key] { + comparisons.append( + .init(name: latest.key, + baseline: nsResult.runtime, latest: latest.value.runtime)) + } + } + return comparisons + } + + func compareCompileTimes( + with other: SuiteResult + ) -> [BenchmarkResult.Comparison] { + var comparisons: [BenchmarkResult.Comparison] = [] + for latest in results { + if let baseline = other.results[latest.key], + let baselineTime = baseline.compileTime, + let latestTime = latest.value.compileTime { + comparisons.append( + .init(name: latest.key, + baseline: baselineTime, + latest: latestTime)) + } + } + return comparisons + } +} + +extension SuiteResult: Codable { + func save(to url: URL) throws { + let encoder = JSONEncoder() + let data = try encoder.encode(self) + try data.write(to: url, options: .atomic) + } + + static func load(from url: URL) throws -> SuiteResult { + let decoder = JSONDecoder() + let data = try Data(contentsOf: url) + return try decoder.decode(SuiteResult.self, from: data) + } +} diff --git a/Sources/RegexBenchmark/BenchmarkRunner.swift b/Sources/RegexBenchmark/BenchmarkRunner.swift index 78953bdd6..1a62858c1 100644 --- a/Sources/RegexBenchmark/BenchmarkRunner.swift +++ b/Sources/RegexBenchmark/BenchmarkRunner.swift @@ -1,4 +1,5 @@ import Foundation +@_spi(RegexBenchmark) import _StringProcessing struct BenchmarkRunner { let suiteName: String @@ -7,60 +8,99 @@ struct BenchmarkRunner { let samples: Int var results: SuiteResult = SuiteResult() let quiet: Bool - - init(_ suiteName: String, _ n: Int, _ quiet: Bool) { - self.suiteName = suiteName - self.samples = n - self.quiet = quiet + let enableTracing: Bool + let enableMetrics: Bool + + // Forcibly include firstMatch benchmarks for all CrossBenchmarks + let includeFirstOverride: Bool + + mutating func register(_ benchmark: some RegexBenchmark) { + 
suite.append(benchmark) } - mutating func register(_ new: some RegexBenchmark) { - suite.append(new) + mutating func register(_ benchmark: some SwiftRegexBenchmark) { + var benchmark = benchmark + if enableTracing { + benchmark.enableTracing() + } + if enableMetrics { + benchmark.enableMetrics() + } + suite.append(benchmark) } - mutating func measure(benchmark: some RegexBenchmark, samples: Int) -> BenchmarkResult { + func medianMeasure( + samples: Int, + closure: () -> Void + ) -> Measurement { + // FIXME: use suspendingclock? var times: [Time] = [] - - // initial run to make sure the regex has been compiled - // todo: measure compile times, or at least how much this first run - // differs from the later ones - benchmark.run() - - // fixme: use suspendingclock? for _ in 0.. BenchmarkResult { + // Initial run to make sure the regex has been compiled + benchmark.run() - times.sort() - let median = times[samples/2] - let mean = times.reduce(0.0, {acc, next in acc + next.seconds}) / Double(times.count) - let stdev = (times.reduce(0.0, {acc, next in acc + pow(next.seconds - mean, 2)}) / Double(times.count)).squareRoot() - return BenchmarkResult(median, stdev, samples) + // Measure compilation time for Swift regex + let compileTime: Measurement? + let parseTime: Measurement? + if benchmark is SwiftRegexBenchmark { + var benchmark = benchmark as! 
SwiftRegexBenchmark + compileTime = medianMeasure(samples: samples) { benchmark.compile() } + // Can't parse if we don't have an input string (ie a builder regex) + if benchmark.pattern != nil { + parseTime = medianMeasure(samples: samples) { let _ = benchmark.parse() } + } else { + parseTime = nil + } + + } else { + compileTime = nil + parseTime = nil + } + + let runtime = medianMeasure(samples: samples) { benchmark.run() } + return BenchmarkResult( + runtime: runtime, + compileTime: compileTime, + parseTime: parseTime) } mutating func run() { print("Running") for b in suite { var result = measure(benchmark: b, samples: samples) - if !quiet { - print("- \(b.name) \(result.median) (stdev: \(Time(result.stdev)))") - } - - if result.stdev > Stats.maxAllowedStdev { - print("Warning: Standard deviation > \(Time(Stats.maxAllowedStdev)) for \(b.name)") - print("N = \(samples), median: \(result.median), stdev: \(Time(result.stdev))") + if result.runtimeIsTooVariant { + print("Warning: Standard deviation > \(Stats.maxAllowedStdev*100)% for \(b.name)") + print(result.runtime) print("Rerunning \(b.name)") - result = measure(benchmark: b, samples: result.samples*2) - print("N = \(result.samples), median: \(result.median), stdev: \(Time(result.stdev))") - if result.stdev > Stats.maxAllowedStdev { + result = measure(benchmark: b, samples: result.runtime.samples*2) + print(result.runtime) + if result.runtimeIsTooVariant { fatalError("Benchmark \(b.name) is too variant") } } + if result.compileTime?.median ?? .zero > Time.millisecond { + print("Warning: Abnormally high compilation time, what happened?") + } + if result.parseTime?.median ?? 
.zero > Time.millisecond { + print("Warning: Abnormally high parse time, what happened?") + } + if !quiet { + print("- \(b.name)\n\(result)") + } self.results.add(name: b.name, result: result) } } @@ -69,104 +109,8 @@ struct BenchmarkRunner { print("Debugging") print("========================") for b in suite { - let result = measure(benchmark: b, samples: samples) - print("- \(b.name) \(result.median) (stdev: \(Time(result.stdev)))") b.debug() print("========================") } } } - -extension BenchmarkRunner { - - func save(to savePath: String) throws { - let url = URL(fileURLWithPath: savePath, isDirectory: false) - let parent = url.deletingLastPathComponent() - if !FileManager.default.fileExists(atPath: parent.path) { - try! FileManager.default.createDirectory(atPath: parent.path, withIntermediateDirectories: true) - } - print("Saving result to \(url.path)") - try results.save(to: url) - } - - func compare(against compareFilePath: String) throws { - let compareFileURL = URL(fileURLWithPath: compareFilePath) - let compareResult = try SuiteResult.load(from: compareFileURL) - let compareFile = compareFileURL.lastPathComponent - - let diff = results - .compare(with: compareResult) - .filter({(name, _) in !name.contains("_NS")}) - let regressions = diff.filter({(_, change) in change.seconds > 0}) - .sorted(by: {(a,b) in a.1 > b.1}) - let improvements = diff.filter({(_, change) in change.seconds < 0}) - .sorted(by: {(a,b) in a.1 < b.1}) - - print("Comparing against benchmark result file \(compareFile)") - print("=== Regressions ======================================================================") - func printComparison(name: String, diff: Time) { - let oldVal = compareResult.results[name]!.median - let newVal = results.results[name]!.median - let percentage = (1000 * diff.seconds / oldVal.seconds).rounded()/10 - let len = max(40 - name.count, 1) - let nameSpacing = String(repeating: " ", count: len) - print("- 
\(name)\(nameSpacing)\(newVal)\t\(oldVal)\t\(diff)\t\t\(percentage)%") - } - - for item in regressions { - printComparison(name: item.key, diff: item.value) - } - - print("=== Improvements =====================================================================") - for item in improvements { - printComparison(name: item.key, diff: item.value) - } - } -} - -struct BenchmarkResult: Codable { - let median: Time - let stdev: Double - let samples: Int - - init(_ median: Time, _ stdev: Double, _ samples: Int) { - self.median = median - self.stdev = stdev - self.samples = samples - } -} - -struct SuiteResult { - var results: [String: BenchmarkResult] = [:] - - mutating func add(name: String, result: BenchmarkResult) { - results.updateValue(result, forKey: name) - } - - func compare(with other: SuiteResult) -> [String: Time] { - var output: [String: Time] = [:] - for item in results { - if let otherVal = other.results[item.key] { - let diff = item.value.median - otherVal.median - if Stats.tTest(item.value, otherVal) { - output.updateValue(diff, forKey: item.key) - } - } - } - return output - } -} - -extension SuiteResult: Codable { - func save(to url: URL) throws { - let encoder = JSONEncoder() - let data = try encoder.encode(self) - try data.write(to: url, options: .atomic) - } - - static func load(from url: URL) throws -> SuiteResult { - let decoder = JSONDecoder() - let data = try Data(contentsOf: url) - return try decoder.decode(SuiteResult.self, from: data) - } -} diff --git a/Sources/RegexBenchmark/CLI.swift b/Sources/RegexBenchmark/CLI.swift index 8ef351329..77ebff47b 100644 --- a/Sources/RegexBenchmark/CLI.swift +++ b/Sources/RegexBenchmark/CLI.swift @@ -10,6 +10,9 @@ struct Runner: ParsableCommand { @Flag(help: "Debug benchmark regexes") var debug = false + + @Option(help: "Load results from this file instead of rerunning") + var load: String? @Option(help: "The file results should be saved to") var save: String? 
@@ -17,15 +20,56 @@ struct Runner: ParsableCommand { @Option(help: "The result file to compare against") var compare: String? + @Option(help: "Compare compile times with the given results file") + var compareCompileTime: String? + + @Flag(help: "Show comparison chart") + var showChart: Bool = false + + @Flag(help: "Compare with NSRegularExpression") + var compareWithNS: Bool = false + + @Option(help: "Save comparison results as csv") + var saveComparison: String? + @Flag(help: "Quiet mode") var quiet = false @Flag(help: "Exclude running NSRegex benchmarks") var excludeNs = false + + @Flag(help: """ +Enable tracing of the engine (warning: lots of output). Prints out processor state each cycle - mutating func run() throws { - var runner = BenchmarkRunner.makeRunner(samples, quiet) +Note: swift-experimental-string-processing must be built with processor measurements enabled +swift build -c release -Xswiftc -DPROCESSOR_MEASUREMENTS_ENABLED + +""") + var enableTracing: Bool = false + + @Flag(help: """ +Enable engine metrics (warning: lots of output). 
Prints out cycle count, instruction counts, number of backtracks +Note: swift-experimental-string-processing must be built with processor measurements enabled +swift build -c release -Xswiftc -DPROCESSOR_MEASUREMENTS_ENABLED + +""") + var enableMetrics: Bool = false + + @Flag(help: "Include firstMatch benchmarks in CrossBenchmark (off by default)") + var includeFirst: Bool = false + + mutating func run() throws { + var runner = BenchmarkRunner( + suiteName: "DefaultRegexSuite", + samples: samples, + quiet: quiet, + enableTracing: enableTracing, + enableMetrics: enableMetrics, + includeFirstOverride: includeFirst) + + runner.registerDefault() + if !self.specificBenchmarks.isEmpty { runner.suite = runner.suite.filter { b in specificBenchmarks.contains { pattern in @@ -35,17 +79,35 @@ struct Runner: ParsableCommand { } if debug { runner.debug() + return + } + + if let loadFile = load { + try runner.load(from: loadFile) } else { if excludeNs { runner.suite = runner.suite.filter { b in !b.name.contains("NS") } } runner.run() - if let compareFile = compare { - try runner.compare(against: compareFile) - } - if let saveFile = save { - try runner.save(to: saveFile) - } + } + if let saveFile = save { + try runner.save(to: saveFile) + } + if saveComparison != nil && compareWithNS && compare != nil { + print("Unable to save both comparison results, specify only one compare operation") + return + } + if compareWithNS { + try runner.compareWithNS(showChart: showChart, saveTo: saveComparison) + } + if let compareFile = compare { + try runner.compare( + against: compareFile, + showChart: showChart, + saveTo: saveComparison) + } + if let compareFile = compareCompileTime { + try runner.compareCompileTimes(against: compareFile, showChart: showChart) } } } diff --git a/Sources/RegexBenchmark/Debug.swift b/Sources/RegexBenchmark/Debug.swift index fcd11f7ca..1171247e4 100644 --- a/Sources/RegexBenchmark/Debug.swift +++ b/Sources/RegexBenchmark/Debug.swift @@ -1,12 +1,21 @@ import 
Foundation +protocol Debug { + func debug() +} + +extension Debug { + var maxStringLengthForPrint: Int { 1000 } + var maxMatchCountForPrint: Int { 100 } +} + extension Benchmark { func debug() { switch type { case .whole: let result = target.wholeMatch(of: regex) if let match = result { - if match.0.count > 100 { + if match.0.count > maxStringLengthForPrint { print("- Match: len = \(match.0.count)") } else { print("- Match: \(match.0)") @@ -22,7 +31,7 @@ extension Benchmark { } print("- Total matches: \(results.count)") - if results.count > 10 { + if results.count > maxMatchCountForPrint { print("# Too many matches, not printing") let avgLen = results.map({result in String(target[result.range]).count}) .reduce(0.0, {$0 + Double($1)}) / Double(results.count) @@ -32,7 +41,7 @@ extension Benchmark { } for match in results { - if match.0.count > 100 { + if match.0.count > maxStringLengthForPrint { print("- Match: len = \(match.0.count)") } else { print("- Match: \(match.0)") @@ -42,7 +51,7 @@ extension Benchmark { case .first: let result = target.firstMatch(of: regex) if let match = result { - if match.0.count > 100 { + if match.0.count > maxStringLengthForPrint { print("- Match: len = \(match.0.count)") } else { print("- Match: \(match.0)") @@ -56,6 +65,7 @@ extension Benchmark { } extension NSBenchmark { + func debug() { switch type { case .allMatches: @@ -66,13 +76,13 @@ extension NSBenchmark { } print("- Total matches: \(results.count)") - if results.count > 10 { + if results.count > maxMatchCountForPrint { print("# Too many matches, not printing") return } for m in results { - if m.range.length > 100 { + if m.range.length > maxStringLengthForPrint { print("- Match: len = \(m.range.length)") } else { print("- Match: \(target[Range(m.range, in: target)!])") @@ -81,7 +91,7 @@ extension NSBenchmark { case .first: let result = regex.firstMatch(in: target, range: range) if let match = result { - if match.range.length > 100 { + if match.range.length > 
maxStringLengthForPrint { print("- Match: len = \(match.range.length)") } else { print("- Match: \(target[Range(match.range, in: target)!])") diff --git a/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift b/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift index 2f971b4e6..61d7b197f 100644 --- a/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift +++ b/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift @@ -15,42 +15,49 @@ extension BenchmarkRunner { register(Benchmark( name: "BasicCCC", regex: try! Regex(basic), + pattern: basic, type: .allMatches, target: input)) register(Benchmark( name: "BasicRangeCCC", regex: try! Regex(basicRange), + pattern: basicRange, type: .allMatches, target: input)) register(Benchmark( name: "CaseInsensitiveCCC", regex: try! Regex(caseInsensitive), + pattern: caseInsensitive, type: .allMatches, target: input)) register(Benchmark( name: "InvertedCCC", regex: try! Regex(inverted), + pattern: inverted, type: .allMatches, target: input)) register(Benchmark( name: "SubtractionCCC", regex: try! Regex(subtraction), + pattern: subtraction, type: .allMatches, target: input)) register(Benchmark( name: "IntersectionCCC", regex: try! Regex(intersection), + pattern: intersection, type: .allMatches, target: input)) register(Benchmark( name: "symDiffCCC", regex: try! 
Regex(symmetricDifference), + pattern: symmetricDifference, type: .allMatches, target: input)) } diff --git a/Sources/RegexBenchmark/Suite/LiteralSearch.swift b/Sources/RegexBenchmark/Suite/LiteralSearch.swift index 1f48f9945..32cf60a7d 100644 --- a/Sources/RegexBenchmark/Suite/LiteralSearch.swift +++ b/Sources/RegexBenchmark/Suite/LiteralSearch.swift @@ -3,7 +3,7 @@ import _StringProcessing extension BenchmarkRunner { mutating func addLiteralSearch() { let searchNotFound = CrossBenchmark(baseName: "LiteralSearchNotFound", regex: "magic_string_to_search_for", input: Inputs.graphemeBreakData) - let search = CrossBenchmark(baseName: "LiteralSearch", regex: "aatcgaagcagtcttctaacacccttagaaaagcaaacactattgaatactgccgccgca", input: Inputs.graphemeBreakData) + let search = CrossBenchmark(baseName: "LiteralSearch", regex: "HANGUL CHOSEONG TIKEUT-MIEUM..HANGUL CHOSEONG SSANGYEORINHIEUH", input: Inputs.graphemeBreakData) searchNotFound.register(&self) search.register(&self) } diff --git a/Sources/RegexBenchmark/Suite/Unicode.swift b/Sources/RegexBenchmark/Suite/Unicode.swift index 5944ab2ca..46afda712 100644 --- a/Sources/RegexBenchmark/Suite/Unicode.swift +++ b/Sources/RegexBenchmark/Suite/Unicode.swift @@ -4,11 +4,11 @@ extension BenchmarkRunner { mutating func addUnicode() { // tagged unicode: unicode characters surrounded by html tags // use the same html regex, uses backreference + reluctant quantification - let tags = #"<(\w*)\b[^>]*>(.*?)<\/\1>"# - let taggedEmojis = CrossBenchmark( - baseName: "TaggedEmojis", - regex: tags, - input: Inputs.taggedEmojis) +// let tags = #"<(\w*)\b[^>]*>(.*?)<\/\1>"# // disabled due to \b being unusably slow +// let taggedEmojis = CrossBenchmark( +// baseName: "TaggedEmojis", +// regex: tags, +// input: Inputs.taggedEmojis) // Now actually matching emojis let emoji = #"(๐Ÿ˜ƒ|๐Ÿ˜€|๐Ÿ˜ณ|๐Ÿ˜ฒ|๐Ÿ˜ฆ|๐Ÿ˜Š|๐Ÿ™Š|๐Ÿ˜˜|๐Ÿ˜|๐Ÿ˜ณ|๐Ÿ˜’){2,5}"# @@ -18,7 +18,7 @@ extension BenchmarkRunner { regex: emoji, input: Inputs.taggedEmojis) - // 
taggedEmojis.register(&self) // disabled due to \b being unusably slow + // taggedEmojis.register(&self) emojiRegex.register(&self) } } diff --git a/Sources/RegexBenchmark/Utils/Stats.swift b/Sources/RegexBenchmark/Utils/Stats.swift index c5c46eef9..0cc9156a4 100644 --- a/Sources/RegexBenchmark/Utils/Stats.swift +++ b/Sources/RegexBenchmark/Utils/Stats.swift @@ -3,10 +3,10 @@ import Foundation enum Stats {} extension Stats { - // 500µs, maybe this should be a % of the runtime for each benchmark? - static let maxAllowedStdev = 500e-6 + // Maximum allowed standard deviation is 5% of the median runtime + static let maxAllowedStdev = 0.05 - static func tTest(_ a: BenchmarkResult, _ b: BenchmarkResult) -> Bool { + static func tTest(_ a: Measurement, _ b: Measurement) -> Bool { // Student's t-test // Since we should generally have similar variances across runs let n1 = Double(a.samples) @@ -18,3 +18,9 @@ extension Stats { return abs(tVal) > 2 } } + +extension BenchmarkResult { + var runtimeIsTooVariant: Bool { + runtime.stdev > Stats.maxAllowedStdev * runtime.median.seconds + } +} diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 66fefc49e..a0f3b2a44 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -22,17 +22,19 @@ extension Compiler { /// This is used to determine whether to apply initial options. 
var hasEmittedFirstMatchableAtom = false - private let compileOptions: CompileOptions + private let compileOptions: _CompileOptions fileprivate var optimizationsEnabled: Bool { !compileOptions.contains(.disableOptimizations) } init( options: MatchingOptions, - compileOptions: CompileOptions, + compileOptions: _CompileOptions, captureList: CaptureList ) { self.options = options self.compileOptions = compileOptions self.builder.captureList = captureList + self.builder.enableTracing = compileOptions.contains(.enableTracing) + self.builder.enableMetrics = compileOptions.contains(.enableMetrics) } } } diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index b8daa8b21..34b0962d8 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -16,7 +16,7 @@ class Compiler { // TODO: Or are these stored on the tree? var options = MatchingOptions() - private var compileOptions: CompileOptions = .default + private var compileOptions: _CompileOptions = .default init(ast: AST) { self.tree = ast.dslTree @@ -26,7 +26,7 @@ class Compiler { self.tree = tree } - init(tree: DSLTree, compileOptions: CompileOptions) { + init(tree: DSLTree, compileOptions: _CompileOptions) { self.tree = tree self.compileOptions = compileOptions } @@ -107,10 +107,15 @@ func _compileRegex( return Executor(program: program) } -extension Compiler { - struct CompileOptions: OptionSet { - let rawValue: Int - static let disableOptimizations = CompileOptions(rawValue: 1) - static let `default`: CompileOptions = [] +@_spi(RegexBenchmark) +public struct _CompileOptions: OptionSet { + public let rawValue: Int + public init(rawValue: Int) { + self.rawValue = rawValue } + + public static let disableOptimizations = _CompileOptions(rawValue: 1 << 0) + public static let enableTracing = _CompileOptions(rawValue: 1 << 1) + public static let enableMetrics = _CompileOptions(rawValue: 1 << 2) + public static let `default`: _CompileOptions 
= [] } diff --git a/Sources/_StringProcessing/Engine/Consume.swift b/Sources/_StringProcessing/Engine/Consume.swift index 12f65a777..6af973919 100644 --- a/Sources/_StringProcessing/Engine/Consume.swift +++ b/Sources/_StringProcessing/Engine/Consume.swift @@ -21,7 +21,8 @@ extension Engine { subjectBounds: bounds, searchBounds: bounds, matchMode: matchMode, - isTracingEnabled: enableTracing) + isTracingEnabled: enableTracing, + shouldMeasureMetrics: enableMetrics) } func makeFirstMatchProcessor( @@ -35,7 +36,8 @@ extension Engine { subjectBounds: subjectBounds, searchBounds: searchBounds, matchMode: .partialFromFront, - isTracingEnabled: enableTracing) + isTracingEnabled: enableTracing, + shouldMeasureMetrics: enableMetrics) } } diff --git a/Sources/_StringProcessing/Engine/Engine.swift b/Sources/_StringProcessing/Engine/Engine.swift index 9e67e4639..a5cb11bd6 100644 --- a/Sources/_StringProcessing/Engine/Engine.swift +++ b/Sources/_StringProcessing/Engine/Engine.swift @@ -13,25 +13,16 @@ // But, we can play around with this. struct Engine { - var program: MEProgram + let program: MEProgram // TODO: Pre-allocated register banks var instructions: InstructionList { program.instructions } - var enableTracing: Bool { - get { program.enableTracing } - set { program.enableTracing = newValue } - } + var enableTracing: Bool { program.enableTracing } + var enableMetrics: Bool { program.enableMetrics } - init( - _ program: MEProgram, - enableTracing: Bool? 
= nil - ) { - var program = program - if let t = enableTracing { - program.enableTracing = t - } + init(_ program: MEProgram) { self.program = program } } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 959b1507e..20885b8c8 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -14,6 +14,10 @@ extension MEProgram { struct Builder { var instructions: [Instruction] = [] + + // Tracing + var enableTracing = false + var enableMetrics = false var elements = TypedSetVector() var sequences = TypedSetVector<[Input.Element], _SequenceRegister>() @@ -378,6 +382,8 @@ extension MEProgram.Builder { staticTransformFunctions: transformFunctions, staticMatcherFunctions: matcherFunctions, registerInfo: regInfo, + enableTracing: enableTracing, + enableMetrics: enableMetrics, captureList: captureList, referencedCaptureOffsets: referencedCaptureOffsets, initialOptions: initialOptions) diff --git a/Sources/_StringProcessing/Engine/MEProgram.swift b/Sources/_StringProcessing/Engine/MEProgram.swift index bacefb209..67f5a8bc9 100644 --- a/Sources/_StringProcessing/Engine/MEProgram.swift +++ b/Sources/_StringProcessing/Engine/MEProgram.swift @@ -31,8 +31,9 @@ struct MEProgram { var registerInfo: RegisterInfo - var enableTracing: Bool = false - + var enableTracing: Bool + var enableMetrics: Bool + let captureList: CaptureList let referencedCaptureOffsets: [ReferenceID: Int] diff --git a/Sources/_StringProcessing/Engine/Metrics.swift b/Sources/_StringProcessing/Engine/Metrics.swift new file mode 100644 index 000000000..753c3c3d1 --- /dev/null +++ b/Sources/_StringProcessing/Engine/Metrics.swift @@ -0,0 +1,37 @@ +extension Processor { + struct ProcessorMetrics { + var instructionCounts: [Instruction.OpCode: Int] = [:] + var backtracks: Int = 0 + var resets: Int = 0 + } + + func printMetrics() { + print("===") + print("Total cycle count: \(cycleCount)") + 
print("Backtracks: \(metrics.backtracks)") + print("Resets: \(metrics.resets)") + print("Instructions:") + let sorted = metrics.instructionCounts + .filter({$0.1 != 0}) + .sorted(by: { (a,b) in a.1 > b.1 }) + for (opcode, count) in sorted { + print("> \(opcode): \(count)") + } + print("===") + } + + mutating func measure() { + let (opcode, _) = fetch().destructure + if metrics.instructionCounts.keys.contains(opcode) { + metrics.instructionCounts[opcode]! += 1 + } else { + metrics.instructionCounts.updateValue(1, forKey: opcode) + } + } + + mutating func measureMetrics() { + if shouldMeasureMetrics { + measure() + } + } +} diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index a62c1e070..66dcb9dbb 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -89,6 +89,8 @@ struct Processor { // MARK: Metrics, debugging, etc. var cycleCount = 0 var isTracingEnabled: Bool + let shouldMeasureMetrics: Bool + var metrics: ProcessorMetrics = ProcessorMetrics() } extension Processor { @@ -105,7 +107,8 @@ extension Processor { subjectBounds: Range, searchBounds: Range, matchMode: MatchMode, - isTracingEnabled: Bool + isTracingEnabled: Bool, + shouldMeasureMetrics: Bool ) { self.controller = Controller(pc: 0) self.instructions = program.instructions @@ -114,6 +117,7 @@ extension Processor { self.searchBounds = searchBounds self.matchMode = matchMode self.isTracingEnabled = isTracingEnabled + self.shouldMeasureMetrics = shouldMeasureMetrics self.currentPosition = searchBounds.lowerBound // Initialize registers with end of search bounds @@ -140,7 +144,8 @@ extension Processor { self.state = .inProgress self.failureReason = nil - + + if shouldMeasureMetrics { metrics.resets += 1 } _checkInvariants() } @@ -356,6 +361,8 @@ extension Processor { storedCaptures = capEnds registers.ints = intRegisters registers.positions = posRegisters + + if shouldMeasureMetrics 
{ metrics.backtracks += 1 } } mutating func abort(_ e: Error? = nil) { @@ -393,12 +400,20 @@ extension Processor { mutating func cycle() { _checkInvariants() assert(state == .inProgress) - if cycleCount == 0 { trace() } + +#if PROCESSOR_MEASUREMENTS_ENABLED + if cycleCount == 0 { + trace() + measureMetrics() + } defer { cycleCount += 1 trace() + measureMetrics() _checkInvariants() } +#endif + let (opcode, payload) = fetch().destructure switch opcode { case .invalid: diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index cbb065fc1..725319b00 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ b/Sources/_StringProcessing/Engine/Tracing.swift @@ -29,27 +29,81 @@ extension Processor: TracedProcessor { extension Instruction: CustomStringConvertible { var description: String { - // TODO: opcode specific rendering - "\(opcode) \(payload)" - } -} - -extension Instruction.Payload: CustomStringConvertible { - var description: String { -// var result = "" -// if hasCondition { -// result += "\(condition) " -// } -// if hasPayload { -// let payload: TypedInt<_Boo> = payload() -// result += payload.description -// } -// return result - - // TODO: Without bit packing our representation, what - // should we do? I'd say a payload cannot be printed - // in isolation of the instruction... 
- return "\(rawValue)" + switch opcode { + case .advance: + return "\(opcode) \(payload.distance)" + case .assertBy: + return "\(opcode) \(payload.assertion)" + case .backreference: + return "\(opcode) \(payload.capture.rawValue)" + case .beginCapture: + return "\(opcode) \(payload.capture.rawValue)" + case .branch: + return "\(opcode) \(payload.addr)" + case .captureValue: + let (val, cap) = payload.pairedValueCapture + return "\(opcode) vals[\(val)] -> captures[\(cap)]" + case .condBranchSamePosition: + let (addr, pos) = payload.pairedAddrPos + return "\(opcode) \(addr) pos[\(pos)]" + case .condBranchZeroElseDecrement: + let (addr, int) = payload.pairedAddrInt + return "\(opcode) \(addr) int[\(int)]" + case .consumeBy: + return "\(opcode) consumer[\(payload.consumer)]" + case .endCapture: + return "\(opcode) \(payload.capture.rawValue)" + case .match: + let (isCaseInsensitive, reg) = payload.elementPayload + if isCaseInsensitive { + return "matchCaseInsensitive char[\(reg)]" + } else { + return "match char[\(reg)]" + } + case .matchBitset: + let (isScalar, reg) = payload.bitsetPayload + if isScalar { + return "matchBitsetScalar bitset[\(reg)]" + } else { + return "matchBitset bitset[\(reg)]" + } + case .matchBuiltin: + let payload = payload.characterClassPayload + return "matchBuiltin \(payload.cc) (\(payload.isInverted))" + case .matchBy: + let (matcherReg, valReg) = payload.pairedMatcherValue + return "\(opcode) match[\(matcherReg)] -> val[\(valReg)]" + case .matchScalar: + let (scalar, caseInsensitive, boundaryCheck) = payload.scalarPayload + if caseInsensitive { + return "matchScalarCaseInsensitive '\(scalar)' boundaryCheck: \(boundaryCheck)" + } else { + return "matchScalar '\(scalar)' boundaryCheck: \(boundaryCheck)" + } + case .moveCurrentPosition: + let reg = payload.position + return "\(opcode) -> pos[\(reg)]" + case .moveImmediate: + let (imm, reg) = payload.pairedImmediateInt + return "\(opcode) \(imm) -> int[\(reg)]" + case .quantify: + let payload = 
payload.quantify + return "\(opcode) \(payload.type) \(payload.minTrips) \(payload.extraTrips?.description ?? "unbounded" )" + case .save: + let resumeAddr = payload.addr + return "\(opcode) \(resumeAddr)" + case .saveAddress: + let resumeAddr = payload.addr + return "\(opcode) \(resumeAddr)" + case .splitSaving: + let (nextPC, resumeAddr) = payload.pairedAddrAddr + return "\(opcode) saving: \(resumeAddr) jumpingTo: \(nextPC)" + case .transformCapture: + let (cap, trans) = payload.pairedCaptureTransform + return "\(opcode) trans[\(trans)](\(cap))" + default: + return "\(opcode)" + } } } @@ -62,7 +116,9 @@ extension Processor.SavePoint { if rangeIsEmpty { posStr = "" } else { - posStr = "\(rangeStart!...rangeEnd!)" + let startStr = "\(input.distance(from: input.startIndex, to: rangeStart!))" + let endStr = "\(input.distance(from: input.startIndex, to: rangeEnd!))" + posStr = "\(startStr)...\(endStr)" } } return """ diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 718d37026..253858d1f 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -15,8 +15,8 @@ struct Executor { // TODO: consider let, for now lets us toggle tracing var engine: Engine - init(program: MEProgram, enablesTracing: Bool = false) { - self.engine = Engine(program, enableTracing: enablesTracing) + init(program: MEProgram) { + self.engine = Engine(program) } @available(SwiftStdlib 5.7, *) @@ -30,7 +30,9 @@ struct Executor { input: input, subjectBounds: subjectBounds, searchBounds: searchBounds) - +#if PROCESSOR_MEASUREMENTS_ENABLED + defer { if cpu.shouldMeasureMetrics { cpu.printMetrics() } } +#endif var low = searchBounds.lowerBound let high = searchBounds.upperBound while true { @@ -57,6 +59,9 @@ struct Executor { ) throws -> Regex.Match? 
{ var cpu = engine.makeProcessor( input: input, bounds: subjectBounds, matchMode: mode) +#if PROCESSOR_MEASUREMENTS_ENABLED + defer { if cpu.shouldMeasureMetrics { cpu.printMetrics() } } +#endif return try _match(input, from: subjectBounds.lowerBound, using: &cpu) } diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 0afe11c77..28e64a6e2 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -82,7 +82,7 @@ extension Regex { let tree: DSLTree /// OptionSet of compiler options for testing purposes - fileprivate var compileOptions: Compiler.CompileOptions = .default + fileprivate var compileOptions: _CompileOptions = .default private final class ProgramBox { let value: MEProgram @@ -136,9 +136,30 @@ extension Regex { } @available(SwiftStdlib 5.7, *) +@_spi(RegexBenchmark) extension Regex { - internal mutating func _setCompilerOptionsForTesting(_ opts: Compiler.CompileOptions) { - program.compileOptions = opts - program._loweredProgramStorage = nil + public enum _RegexInternalAction { + case recompile + case addOptions(_CompileOptions) + } + + /// Internal API for RegexBenchmark + /// Forces the regex to perform the given action, returning if it was successful + public mutating func _forceAction(_ action: _RegexInternalAction) -> Bool { + do { + switch action { + case .addOptions(let opts): + program.compileOptions.insert(opts) + program._loweredProgramStorage = nil + return true + case .recompile: + let _ = try Compiler( + tree: program.tree, + compileOptions: program.compileOptions).emit() + return true + } + } catch { + return false + } } } diff --git a/Sources/_StringProcessing/Utility/Traced.swift b/Sources/_StringProcessing/Utility/Traced.swift index 5ae7cd245..112a601b1 100644 --- a/Sources/_StringProcessing/Utility/Traced.swift +++ b/Sources/_StringProcessing/Utility/Traced.swift @@ -80,6 +80,21 @@ extension TracedProcessor { } func formatInput() -> 
String { + let distanceFromStart = input.distance( + from: input.startIndex, + to: currentPosition) + + // Cut a reasonably sized substring from the input to print + let start = input.index( + currentPosition, + offsetBy: -30, + limitedBy: input.startIndex) ?? input.startIndex + let end = input.index( + currentPosition, + offsetBy: 30, + limitedBy: input.endIndex) ?? input.endIndex + let input = input[start..