From 5e14a1d5879bbcae436c597d2841fae10df8f69e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafael=20Mendon=C3=A7a=20Fran=C3=A7a?=
 <rafael.franca@shopify.com>
Date: Fri, 14 Nov 2025 22:37:20 +0000
Subject: [PATCH 1/6] Add tests for run_benchmarks.rb helper methods

---
 Rakefile                                |  14 +
 lib/benchmark_runner.rb                 | 158 +++++++++
 run_benchmarks.rb                       | 131 +------
 test/benchmark_runner_test.rb           | 442 ++++++++++++++++++++++++
 test/run_benchmarks_integration_test.rb | 123 +++++++
 5 files changed, 745 insertions(+), 123 deletions(-)
 create mode 100644 Rakefile
 create mode 100644 lib/benchmark_runner.rb
 create mode 100644 test/benchmark_runner_test.rb
 create mode 100644 test/run_benchmarks_integration_test.rb

diff --git a/Rakefile b/Rakefile
new file mode 100644
index 00000000..b269374e
--- /dev/null
+++ b/Rakefile
@@ -0,0 +1,14 @@
+# frozen_string_literal: true
+
+require 'rake/testtask'
+
+desc 'Run all tests'
+Rake::TestTask.new(:test) do |t|
+  t.libs << 'test'
+  t.libs << 'lib'
+  t.test_files = FileList['test/**/*_test.rb']
+  t.verbose = true
+  t.warning = true
+end
+
+task default: :test
diff --git a/lib/benchmark_runner.rb b/lib/benchmark_runner.rb
new file mode 100644
index 00000000..6e350e86
--- /dev/null
+++ b/lib/benchmark_runner.rb
@@ -0,0 +1,158 @@
+# frozen_string_literal: true
+
+require 'csv'
+require 'json'
+require 'rbconfig'
+
+# Extracted helper methods from run_benchmarks.rb for testing
+module BenchmarkRunner
+  module_function
+
+  # Format benchmark data as a string table
+  def table_to_str(table_data, format, failures)
+    # Trim numbers to one decimal for console display
+    # Keep two decimals for the speedup ratios
+
+    failure_rows = failures.map { |_exe, data| data.keys }.flatten.uniq
+                           .map { |name| [name] + (['N/A'] * (table_data.first.size - 1)) }
+
+    table_data = table_data.first(1) + failure_rows + table_data.drop(1).map { |row|
+      format.zip(row).map { |fmt, data| fmt % data }
+    }
+
+    num_rows = table_data.length
+    num_cols = table_data[0].length
+
+    # Pad each column to the maximum width in the column
+    (0...num_cols).each do |c|
+      cell_lens = (0...num_rows).map { |r| table_data[r][c].length }
+      max_width = cell_lens.max
+      (0...num_rows).each { |r| table_data[r][c] = table_data[r][c].ljust(max_width) }
+    end
+
+    # Row of separator dashes
+    sep_row = (0...num_cols).map { |i| '-' * table_data[0][i].length }.join('  ').rstrip
+
+    out = sep_row + "\n"
+
+    table_data.each do |row|
+      out += row.join('  ').rstrip + "\n"
+    end
+
+    out += sep_row + "\n"
+
+    out
+  end
+
+  # Find the first available file number for output files
+  def free_file_no(prefix)
+    (1..).each do |file_no|
+      out_path = File.join(prefix, "output_%03d.csv" % file_no)
+      return file_no unless File.exist?(out_path)
+    end
+  end
+
+  # Get benchmark categories from metadata
+  def benchmark_categories(name, metadata)
+    benchmark_metadata = metadata.find { |benchmark, _metadata| benchmark == name }&.last || {}
+    categories = [benchmark_metadata.fetch('category', 'other')]
+    categories << 'ractor' if benchmark_metadata['ractor']
+    categories
+  end
+
+  # Check if the name matches any of the names in a list of filters
+  def match_filter(entry, categories:, name_filters:, metadata:)
+    name_filters = process_name_filters(name_filters)
+    name = entry.sub(/\.rb\z/, '')
+    (categories.empty? || benchmark_categories(name, metadata).any? { |cat| categories.include?(cat) }) &&
+      (name_filters.empty? || name_filters.any? { |filter| filter === name })
+  end
+
+  # Process "/my_benchmark/i" into /my_benchmark/i
+  def process_name_filters(name_filters)
+    name_filters.map do |name_filter|
+      if name_filter[0] == "/"
+        regexp_str = name_filter[1..-1].reverse.sub(/\A(\w*)\//, "")
+        regexp_opts = ::Regexp.last_match(1).to_s
+        regexp_str.reverse!
+        r = /#{regexp_str}/
+        if !regexp_opts.empty?
+          # Convert option string to Regexp option flags
+          flags = 0
+          flags |= Regexp::IGNORECASE if regexp_opts.include?('i')
+          flags |= Regexp::MULTILINE if regexp_opts.include?('m')
+          flags |= Regexp::EXTENDED if regexp_opts.include?('x')
+          r = Regexp.new(regexp_str, flags)
+        end
+        r
+      else
+        name_filter
+      end
+    end
+  end
+
+  # Resolve the pre_init file path into a form that can be required
+  def expand_pre_init(path)
+    require 'pathname'
+
+    path = Pathname.new(path)
+
+    unless path.exist?
+      puts "--with-pre-init called with non-existent file!"
+      exit(-1)
+    end
+
+    if path.directory?
+      puts "--with-pre-init called with a directory, please pass a .rb file"
+      exit(-1)
+    end
+
+    library_name = path.basename(path.extname)
+    load_path = path.parent.expand_path
+
+    [
+      "-I", load_path,
+      "-r", library_name
+    ]
+  end
+
+  # Sort benchmarks with headlines first, then others, then micro
+  def sort_benchmarks(bench_names, metadata)
+    headline_benchmarks = metadata.select { |_, meta| meta['category'] == 'headline' }.keys
+    micro_benchmarks = metadata.select { |_, meta| meta['category'] == 'micro' }.keys
+
+    headline_names, bench_names = bench_names.partition { |name| headline_benchmarks.include?(name) }
+    micro_names, other_names = bench_names.partition { |name| micro_benchmarks.include?(name) }
+    headline_names.sort + other_names.sort + micro_names.sort
+  end
+
+  # Check which OS we are running
+  def os
+    @os ||= (
+      host_os = RbConfig::CONFIG['host_os']
+      case host_os
+      when /mswin|msys|mingw|cygwin|bccwin|wince|emc/
+        :windows
+      when /darwin|mac os/
+        :macosx
+      when /linux/
+        :linux
+      when /solaris|bsd/
+        :unix
+      else
+        raise "unknown os: #{host_os.inspect}"
+      end
+    )
+  end
+
+  # Generate setarch prefix for Linux
+  def setarch_prefix
+    # Disable address space randomization (for determinism)
+    prefix = ["setarch", `uname -m`.strip, "-R"]
+
+    # Abort if we don't have permission (perhaps in a docker container).
+    return [] unless system(*prefix, "true", out: File::NULL, err: File::NULL)
+
+    prefix
+  end
+end
diff --git a/run_benchmarks.rb b/run_benchmarks.rb
index 58b6d2ae..2e0b5bbc 100755
--- a/run_benchmarks.rb
+++ b/run_benchmarks.rb
@@ -11,24 +11,11 @@
 require 'etc'
 require 'yaml'
 require_relative 'misc/stats'
+require_relative 'lib/benchmark_runner'
 
 # Check which OS we are running
 def os
-  @os ||= (
-    host_os = RbConfig::CONFIG['host_os']
-    case host_os
-    when /mswin|msys|mingw|cygwin|bccwin|wince|emc/
-      :windows
-    when /darwin|mac os/
-      :macosx
-    when /linux/
-      :linux
-    when /solaris|bsd/
-      :unix
-    else
-      raise "unknown os: #{host_os.inspect}"
-    end
-  )
+  BenchmarkRunner.os
 end
 
 # Checked system - error or return info if the command fails
@@ -123,38 +110,7 @@ def performance_governor?
 end
 
 def table_to_str(table_data, format, failures)
-  # Trim numbers to one decimal for console display
-  # Keep two decimals for the speedup ratios
-
-  failure_rows = failures.map { |_exe, data| data.keys }.flatten.uniq
-                         .map { |name| [name] + (['N/A'] * (table_data.first.size - 1)) }
-
-  table_data = table_data.first(1) + failure_rows + table_data.drop(1).map { |row|
-    format.zip(row).map { |fmt, data| fmt % data }
-  }
-
-  num_rows = table_data.length
-  num_cols = table_data[0].length
-
-  # Pad each column to the maximum width in the column
-  (0...num_cols).each do |c|
-    cell_lens = (0...num_rows).map { |r| table_data[r][c].length }
-    max_width = cell_lens.max
-    (0...num_rows).each { |r| table_data[r][c] = table_data[r][c].ljust(max_width) }
-  end
-
-  # Row of separator dashes
-  sep_row = (0...num_cols).map { |i| '-' * table_data[0][i].length }.join('  ')
-
-  out = sep_row + "\n"
-
-  table_data.each do |row|
-    out += row.join('  ') + "\n"
-  end
-
-  out += sep_row
-
-  return out
+  BenchmarkRunner.table_to_str(table_data, format, failures)
 end
 
 def mean(values)
@@ -165,69 +121,9 @@ def stddev(values)
   Stats.new(values).stddev
 end
 
-def free_file_no(prefix)
-  (1..).each do |file_no|
-    out_path = File.join(prefix, "output_%03d.csv" % file_no)
-    if !File.exist?(out_path)
-      return file_no
-    end
-  end
-end
-
-def benchmark_categories(name)
-  metadata = benchmarks_metadata.find { |benchmark, _metadata| benchmark == name }&.last || {}
-  categories = [metadata.fetch('category', 'other')]
-  categories << 'ractor' if metadata['ractor']
-  categories
-end
-
 # Check if the name matches any of the names in a list of filters
 def match_filter(entry, categories:, name_filters:)
-  name_filters = process_name_filters(name_filters)
-  name = entry.sub(/\.rb\z/, '')
-  (categories.empty? || benchmark_categories(name).any? { |cat| categories.include?(cat) }) &&
-    (name_filters.empty? || name_filters.any? { |filter| filter === name })
-end
-
-# process "/my_benchmark/i" into /my_benchmark/i
-def process_name_filters(name_filters)
-  name_filters.map do |name_filter|
-    if name_filter[0] == "/"
-      regexp_str = name_filter[1..-1].reverse.sub(/\A(\w*)\//, "")
-      regexp_opts = $1.to_s
-      regexp_str.reverse!
-      r = /#{regexp_str}/
-      if !regexp_opts.empty?
-        r = Regexp.compile(r.to_s, regexp_opts)
-      end
-      r
-    else
-      name_filter
-    end
-  end
-end
-
-# Resolve the pre_init file path into a form that can be required
-def expand_pre_init(path)
-  path = Pathname.new(path)
-
-  unless path.exist?
-    puts "--with-pre-init called with non-existent file!"
-    exit(-1)
-  end
-
-  if path.directory?
-    puts "--with-pre-init called with a directory, please pass a .rb file"
-    exit(-1)
-  end
-
-  library_name = path.basename(path.extname)
-  load_path = path.parent.expand_path
-
-  [
-    "-I", load_path,
-    "-r", library_name
-  ]
+  BenchmarkRunner.match_filter(entry, categories: categories, name_filters: name_filters, metadata: benchmarks_metadata)
 end
 
 def benchmarks_metadata
@@ -235,22 +131,11 @@ def benchmarks_metadata
 end
 
 def sort_benchmarks(bench_names)
-  headline_benchmarks = benchmarks_metadata.select { |_, metadata| metadata['category'] == 'headline' }.keys
-  micro_benchmarks = benchmarks_metadata.select { |_, metadata| metadata['category'] == 'micro' }.keys
-
-  headline_names, bench_names = bench_names.partition { |name| headline_benchmarks.include?(name) }
-  micro_names, other_names = bench_names.partition { |name| micro_benchmarks.include?(name) }
-  headline_names.sort + other_names.sort + micro_names.sort
+  BenchmarkRunner.sort_benchmarks(bench_names, benchmarks_metadata)
 end
 
 def setarch_prefix
-  # Disable address space randomization (for determinism)
-  prefix = ["setarch", `uname -m`.strip, "-R"]
-
-  # Abort if we don't have permission (perhaps in a docker container).
-  return [] unless system(*prefix, "true")
-
-  prefix
+  BenchmarkRunner.setarch_prefix
 end
 
 # Run all the benchmarks and record execution times
@@ -284,7 +169,7 @@ def run_benchmarks(ruby:, ruby_description:, categories:, name_filters:, out_pat
   end
 
   if pre_init
-    pre_init = expand_pre_init(pre_init)
+    pre_init = BenchmarkRunner.expand_pre_init(pre_init)
   end
 
 
@@ -603,7 +488,7 @@ def run_benchmarks(ruby:, ruby_description:, categories:, name_filters:, out_pat
   output_path = args.out_override
 else
   # If no out path is specified, find a free file index for the output files
-  file_no = free_file_no(args.out_path)
+  file_no = BenchmarkRunner.free_file_no(args.out_path)
   output_path = File.join(args.out_path, "output_%03d" % file_no)
 end
 
diff --git a/test/benchmark_runner_test.rb b/test/benchmark_runner_test.rb
new file mode 100644
index 00000000..a9265801
--- /dev/null
+++ b/test/benchmark_runner_test.rb
@@ -0,0 +1,442 @@
+require_relative 'test_helper'
+require_relative '../lib/benchmark_runner'
+require_relative '../misc/stats'
+require 'tempfile'
+require 'tmpdir'
+require 'fileutils'
+require 'json'
+require 'csv'
+require 'yaml'
+
+describe BenchmarkRunner do
+  describe '.table_to_str' do
+    it 'formats a simple table correctly' do
+      table_data = [
+        ['bench', 'time (ms)', 'stddev (%)'],
+        ['fib', '100.5', '2.3'],
+        ['loop', '50.2', '1.1']
+      ]
+      format = ['%s', '%s', '%s']
+      failures = {}
+
+      result = BenchmarkRunner.table_to_str(table_data, format, failures)
+
+      assert_equal <<~TABLE, result
+        -----  ---------  ----------
+        bench  time (ms)  stddev (%)
+        fib    100.5      2.3
+        loop   50.2       1.1
+        -----  ---------  ----------
+      TABLE
+    end
+
+    it 'includes failure rows when failures are present' do
+      table_data = [
+        ['bench', 'time (ms)'],
+        ['fib', '100.5']
+      ]
+      format = ['%s', '%s']
+      failures = { 'ruby' => { 'broken_bench' => 1 } }
+
+      result = BenchmarkRunner.table_to_str(table_data, format, failures)
+
+      assert_equal <<~TABLE, result
+        ------------  ---------
+        bench         time (ms)
+        broken_bench  N/A
+        fib           100.5
+        ------------  ---------
+      TABLE
+    end
+
+    it 'handles empty failures hash' do
+      table_data = [['bench'], ['fib']]
+      format = ['%s']
+      failures = {}
+
+      result = BenchmarkRunner.table_to_str(table_data, format, failures)
+
+      assert_equal <<~TABLE, result
+        -----
+        bench
+        fib
+        -----
+      TABLE
+    end
+  end
+
+  describe '.free_file_no' do
+    it 'returns 1 when no files exist' do
+      Dir.mktmpdir do |dir|
+        file_no = BenchmarkRunner.free_file_no(dir)
+        assert_equal 1, file_no
+      end
+    end
+
+    it 'returns next available number when files exist' do
+      Dir.mktmpdir do |dir|
+        FileUtils.touch(File.join(dir, 'output_001.csv'))
+        FileUtils.touch(File.join(dir, 'output_002.csv'))
+
+        file_no = BenchmarkRunner.free_file_no(dir)
+        assert_equal 3, file_no
+      end
+    end
+
+    it 'finds first gap in numbering' do
+      Dir.mktmpdir do |dir|
+        FileUtils.touch(File.join(dir, 'output_001.csv'))
+        FileUtils.touch(File.join(dir, 'output_003.csv'))
+
+        file_no = BenchmarkRunner.free_file_no(dir)
+        assert_equal 2, file_no
+      end
+    end
+
+    it 'handles triple digit numbers' do
+      Dir.mktmpdir do |dir|
+        (1..100).each do |i|
+          FileUtils.touch(File.join(dir, 'output_%03d.csv' % i))
+        end
+
+        file_no = BenchmarkRunner.free_file_no(dir)
+        assert_equal 101, file_no
+      end
+    end
+  end
+
+  describe '.benchmark_categories' do
+    before do
+      @metadata = {
+        'fib' => { 'category' => 'micro' },
+        'railsbench' => { 'category' => 'headline' },
+        'optcarrot' => { 'category' => 'headline' },
+        'ractor_bench' => { 'category' => 'other', 'ractor' => true },
+        'unknown_bench' => {}
+      }
+    end
+
+    it 'returns category from metadata' do
+      result = BenchmarkRunner.benchmark_categories('fib', @metadata)
+      assert_equal ['micro'], result
+    end
+
+    it 'includes ractor category when ractor metadata is true' do
+      result = BenchmarkRunner.benchmark_categories('ractor_bench', @metadata)
+      assert_includes result, 'other'
+      assert_includes result, 'ractor'
+      assert_equal 2, result.size
+    end
+
+    it 'returns other as default category' do
+      result = BenchmarkRunner.benchmark_categories('unknown_bench', @metadata)
+      assert_equal ['other'], result
+    end
+
+    it 'returns other for completely missing benchmark' do
+      result = BenchmarkRunner.benchmark_categories('nonexistent', @metadata)
+      assert_equal ['other'], result
+    end
+
+    it 'handles headline benchmarks' do
+      result = BenchmarkRunner.benchmark_categories('railsbench', @metadata)
+      assert_equal ['headline'], result
+    end
+  end
+
+  describe '.match_filter' do
+    before do
+      @metadata = {
+        'fib' => { 'category' => 'micro' },
+        'railsbench' => { 'category' => 'headline' },
+        'optcarrot' => { 'category' => 'headline' },
+        'ractor_bench' => { 'category' => 'other', 'ractor' => true }
+      }
+    end
+
+    it 'matches when no filters provided' do
+      result = BenchmarkRunner.match_filter('fib.rb', categories: [], name_filters: [], metadata: @metadata)
+      assert_equal true, result
+    end
+
+    it 'matches by category' do
+      result = BenchmarkRunner.match_filter('fib.rb', categories: ['micro'], name_filters: [], metadata: @metadata)
+      assert_equal true, result
+
+      result = BenchmarkRunner.match_filter('fib.rb', categories: ['headline'], name_filters: [], metadata: @metadata)
+      assert_equal false, result
+    end
+
+    it 'matches by name filter' do
+      result = BenchmarkRunner.match_filter('fib.rb', categories: [], name_filters: ['fib'], metadata: @metadata)
+      assert_equal true, result
+
+      result = BenchmarkRunner.match_filter('fib.rb', categories: [], name_filters: ['rails'], metadata: @metadata)
+      assert_equal false, result
+    end
+
+    it 'matches ractor category' do
+      result = BenchmarkRunner.match_filter('ractor_bench.rb', categories: ['ractor'], name_filters: [], metadata: @metadata)
+      assert_equal true, result
+    end
+
+    it 'strips .rb extension from entry name' do
+      result = BenchmarkRunner.match_filter('fib.rb', categories: [], name_filters: ['fib'], metadata: @metadata)
+      assert_equal true, result
+    end
+
+    it 'handles regex filters' do
+      result = BenchmarkRunner.match_filter('railsbench.rb', categories: [], name_filters: ['/rails/'], metadata: @metadata)
+      assert_equal true, result
+
+      result = BenchmarkRunner.match_filter('fib.rb', categories: [], name_filters: ['/rails/'], metadata: @metadata)
+      assert_equal false, result
+    end
+
+    it 'handles case-insensitive regex filters' do
+      result = BenchmarkRunner.match_filter('railsbench.rb', categories: [], name_filters: ['/rails/i'], metadata: @metadata)
+      assert_equal true, result
+    end
+
+    it 'handles multiple categories' do
+      result = BenchmarkRunner.match_filter('fib.rb', categories: ['micro', 'headline'], name_filters: [], metadata: @metadata)
+      assert_equal true, result
+
+      result = BenchmarkRunner.match_filter('railsbench.rb', categories: ['micro', 'headline'], name_filters: [], metadata: @metadata)
+      assert_equal true, result
+    end
+  end
+
+  describe '.process_name_filters' do
+    it 'returns string filters unchanged' do
+      filters = ['fib', 'rails']
+      result = BenchmarkRunner.process_name_filters(filters)
+      assert_equal filters, result
+    end
+
+    it 'converts regex strings to Regexp objects' do
+      filters = ['/fib/']
+      result = BenchmarkRunner.process_name_filters(filters)
+      assert_equal 1, result.length
+      assert_instance_of Regexp, result[0]
+      refute_nil (result[0] =~ 'fib')
+    end
+
+    it 'handles regex with flags' do
+      filters = ['/FIB/i']
+      result = BenchmarkRunner.process_name_filters(filters)
+      refute_nil (result[0] =~ 'fib')
+      refute_nil (result[0] =~ 'FIB')
+    end
+
+    it 'handles mixed filters' do
+      filters = ['fib', '/rails/', 'optcarrot']
+      result = BenchmarkRunner.process_name_filters(filters)
+      assert_equal 3, result.length
+      assert_equal 'fib', result[0]
+      assert_instance_of Regexp, result[1]
+      assert_equal 'optcarrot', result[2]
+    end
+
+    it 'handles complex regex patterns' do
+      filters = ['/opt.*rot/']
+      result = BenchmarkRunner.process_name_filters(filters)
+      refute_nil (result[0] =~ 'optcarrot')
+      assert_nil (result[0] =~ 'fib')
+    end
+
+    it 'handles empty filter list' do
+      filters = []
+      result = BenchmarkRunner.process_name_filters(filters)
+      assert_equal [], result
+    end
+  end
+
+  describe '.expand_pre_init' do
+    it 'returns load path and require options for valid file' do
+      Dir.mktmpdir do |dir|
+        file = File.join(dir, 'pre_init.rb')
+        FileUtils.touch(file)
+
+        result = BenchmarkRunner.expand_pre_init(file)
+
+        assert_equal 4, result.length
+        assert_equal '-I', result[0]
+        assert_equal dir, result[1].to_s
+        assert_equal '-r', result[2]
+        assert_equal 'pre_init', result[3].to_s
+      end
+    end
+
+    it 'handles files with different extensions' do
+      Dir.mktmpdir do |dir|
+        file = File.join(dir, 'my_config.rb')
+        FileUtils.touch(file)
+
+        result = BenchmarkRunner.expand_pre_init(file)
+
+        assert_equal 'my_config', result[3].to_s
+      end
+    end
+
+    it 'handles nested directories' do
+      Dir.mktmpdir do |dir|
+        subdir = File.join(dir, 'config', 'initializers')
+        FileUtils.mkdir_p(subdir)
+        file = File.join(subdir, 'setup.rb')
+        FileUtils.touch(file)
+
+        result = BenchmarkRunner.expand_pre_init(file)
+
+        assert_equal subdir, result[1].to_s
+        assert_equal 'setup', result[3].to_s
+      end
+    end
+
+    it 'exits when file does not exist' do
+      assert_raises(SystemExit) { BenchmarkRunner.expand_pre_init('/nonexistent/file.rb') }
+    end
+
+    it 'exits when path is a directory' do
+      Dir.mktmpdir do |dir|
+        assert_raises(SystemExit) { BenchmarkRunner.expand_pre_init(dir) }
+      end
+    end
+  end
+
+  describe '.sort_benchmarks' do
+    before do
+      @metadata = {
+        'fib' => { 'category' => 'micro' },
+        'railsbench' => { 'category' => 'headline' },
+        'optcarrot' => { 'category' => 'headline' },
+        'some_bench' => { 'category' => 'other' },
+        'another_bench' => { 'category' => 'other' },
+        'zebra' => { 'category' => 'other' }
+      }
+    end
+
+    it 'sorts benchmarks with headlines first, then others, then micro' do
+      bench_names = ['fib', 'some_bench', 'railsbench', 'another_bench', 'optcarrot']
+      result = BenchmarkRunner.sort_benchmarks(bench_names, @metadata)
+
+      # Headlines should be first
+      headline_indices = [result.index('railsbench'), result.index('optcarrot')]
+      assert_equal true, headline_indices.all? { |i| i < 2 }
+
+      # Micro should be last
+      assert_equal 'fib', result.last
+
+      # Others in the middle
+      other_indices = [result.index('some_bench'), result.index('another_bench')]
+      assert_equal true, other_indices.all? { |i| i >= 2 && i < result.length - 1 }
+    end
+
+    it 'sorts alphabetically within categories' do
+      bench_names = ['zebra', 'another_bench', 'some_bench']
+      result = BenchmarkRunner.sort_benchmarks(bench_names, @metadata)
+      assert_equal ['another_bench', 'some_bench', 'zebra'], result
+    end
+
+    it 'handles empty list' do
+      result = BenchmarkRunner.sort_benchmarks([], @metadata)
+      assert_equal [], result
+    end
+
+    it 'handles single benchmark' do
+      result = BenchmarkRunner.sort_benchmarks(['fib'], @metadata)
+      assert_equal ['fib'], result
+    end
+
+    it 'handles only headline benchmarks' do
+      bench_names = ['railsbench', 'optcarrot']
+      result = BenchmarkRunner.sort_benchmarks(bench_names, @metadata)
+      assert_equal ['optcarrot', 'railsbench'], result
+    end
+  end
+
+  describe '.os' do
+    it 'detects the operating system' do
+      result = BenchmarkRunner.os
+      assert_includes [:linux, :macosx, :windows, :unix], result
+    end
+
+    it 'caches the os result' do
+      first_call = BenchmarkRunner.os
+      second_call = BenchmarkRunner.os
+      assert_equal second_call, first_call
+    end
+
+    it 'returns a symbol' do
+      result = BenchmarkRunner.os
+      assert_instance_of Symbol, result
+    end
+  end
+
+  describe '.setarch_prefix' do
+    it 'returns an array' do
+      result = BenchmarkRunner.setarch_prefix
+      assert_instance_of Array, result
+    end
+
+    it 'returns setarch command on Linux with proper permissions' do
+      skip 'Not on Linux' unless BenchmarkRunner.os == :linux
+
+      prefix = BenchmarkRunner.setarch_prefix
+
+      # Should either return the prefix or empty array if no permission
+      assert_includes [0, 3], prefix.length
+
+      if prefix.length == 3
+        assert_equal 'setarch', prefix[0]
+        assert_equal '-R', prefix[2]
+      end
+    end
+
+    it 'returns empty array when setarch fails' do
+      skip 'Test requires Linux' unless BenchmarkRunner.os == :linux
+
+      # If we don't have permissions, it should return empty array
+      prefix = BenchmarkRunner.setarch_prefix
+      if prefix.empty?
+        assert_equal [], prefix
+      else
+        # If we do have permissions, verify the structure
+        assert_equal 3, prefix.length
+      end
+    end
+  end
+
+  describe 'Stats integration' do
+    it 'calculates mean correctly' do
+      values = [1, 2, 3, 4, 5]
+      assert_equal 3.0, Stats.new(values).mean
+    end
+
+    it 'calculates stddev correctly' do
+      values = [2, 4, 4, 4, 5, 5, 7, 9]
+      result = Stats.new(values).stddev
+      assert_in_delta 2.0, result, 0.1
+    end
+  end
+
+  describe 'output file format' do
+    it 'generates correct output file number format' do
+      Dir.mktmpdir do |dir|
+        file_no = 1
+        expected_path = File.join(dir, "output_001.csv")
+
+        assert_equal expected_path, File.join(dir, "output_%03d.csv" % file_no)
+      end
+    end
+
+    it 'handles triple digit file numbers' do
+      Dir.mktmpdir do |dir|
+        file_no = 123
+        expected_path = File.join(dir, "output_123.csv")
+
+        assert_equal expected_path, File.join(dir, "output_%03d.csv" % file_no)
+      end
+    end
+  end
+end
diff --git a/test/run_benchmarks_integration_test.rb b/test/run_benchmarks_integration_test.rb
new file mode 100644
index 00000000..5142e069
--- /dev/null
+++ b/test/run_benchmarks_integration_test.rb
@@ -0,0 +1,123 @@
+require_relative 'test_helper'
+require 'shellwords'
+require 'tmpdir'
+require 'fileutils'
+
+describe 'run_benchmarks.rb integration' do
+  before do
+    @script_path = File.expand_path('../run_benchmarks.rb', __dir__)
+    @ruby_path = RbConfig.ruby
+  end
+
+  describe 'command-line parsing' do
+    it 'shows help with --help flag' do
+      skip 'Skipping integration test - requires full setup'
+    end
+
+    it 'handles --once flag' do
+      Dir.mktmpdir do |tmpdir|
+        # This would set ENV["WARMUP_ITRS"] = "0" and ENV["MIN_BENCH_ITRS"] = "1"
+        cmd = "#{@ruby_path} #{@script_path} --once --name_filters=fib --out_path=#{tmpdir} 2>&1"
+        result = `#{cmd}`
+
+        # Should run but may fail due to missing benchmarks - that's okay
+        # We're just checking the script can parse arguments
+        skip 'Requires benchmark environment' unless $?.success? || result.include?('Running benchmark')
+      end
+    end
+  end
+
+  describe 'output files' do
+    it 'creates output files with correct naming convention' do
+      Dir.mktmpdir do |tmpdir|
+        # Create some mock output files
+        File.write(File.join(tmpdir, 'output_001.csv'), 'test')
+        File.write(File.join(tmpdir, 'output_002.csv'), 'test')
+
+        # The free_file_no function should find the next number
+        require_relative '../lib/benchmark_runner'
+        file_no = BenchmarkRunner.free_file_no(tmpdir)
+        assert_equal 3, file_no
+      end
+    end
+
+    it 'uses correct output file format' do
+      file_no = 42
+      expected = 'output_042.csv'
+      actual = 'output_%03d.csv' % file_no
+      assert_equal expected, actual
+    end
+  end
+
+  describe 'benchmark metadata' do
+    before do
+      @benchmarks_yml = File.expand_path('../benchmarks.yml', __dir__)
+    end
+
+    it 'benchmarks.yml exists' do
+      assert_equal true, File.exist?(@benchmarks_yml)
+    end
+
+    it 'benchmarks.yml is valid YAML' do
+      require 'yaml'
+      data = YAML.load_file(@benchmarks_yml)
+      assert_instance_of Hash, data
+    end
+
+    it 'benchmarks.yml has valid category values' do
+      require 'yaml'
+      data = YAML.load_file(@benchmarks_yml)
+      valid_categories = ['headline', 'micro', 'other']
+
+      data.each do |name, metadata|
+        if metadata['category']
+          assert_includes valid_categories, metadata['category'],
+            "Benchmark '#{name}' has invalid category: #{metadata['category']}"
+        end
+      end
+    end
+  end
+
+  describe 'script structure' do
+    it 'run_benchmarks.rb is executable' do
+      assert_equal true, File.executable?(@script_path)
+    end
+
+    it 'run_benchmarks.rb has shebang' do
+      first_line = File.open(@script_path, &:readline)
+      assert_match(/^#!.*ruby/, first_line)
+    end
+
+    it 'loads required dependencies' do
+      # Check that the script loads without errors (syntax check)
+      cmd = "#{@ruby_path} -c #{@script_path}"
+      result = `#{cmd} 2>&1`
+      assert_includes result, 'Syntax OK'
+    end
+  end
+
+  describe 'environment handling' do
+    it 'handles WARMUP_ITRS environment variable' do
+      # The script should read ENV["WARMUP_ITRS"]
+      script_content = File.read(@script_path)
+      assert_includes script_content, 'WARMUP_ITRS'
+    end
+
+    it 'handles MIN_BENCH_ITRS environment variable' do
+      script_content = File.read(@script_path)
+      assert_includes script_content, 'MIN_BENCH_ITRS'
+    end
+  end
+
+  describe 'stats module integration' do
+    it 'uses Stats class for calculations' do
+      require_relative '../misc/stats'
+
+      values = [1, 2, 3, 4, 5]
+      stats = Stats.new(values)
+
+      assert_equal 3.0, stats.mean
+      assert_in_delta 1.414, stats.stddev, 0.01
+    end
+  end
+end

From 9b3a0088dce46ff71c984bfe5c50c63e57e1b38a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafael=20Mendon=C3=A7a=20Fran=C3=A7a?=
 <rafael.franca@shopify.com>
Date: Fri, 14 Nov 2025 23:10:11 +0000
Subject: [PATCH 2/6] Extract table formatter and add tests

---
 lib/benchmark_runner.rb       |  36 ----------
 lib/table_formatter.rb        |  77 ++++++++++++++++++++++
 run_benchmarks.rb             |   7 +-
 test/benchmark_runner_test.rb |  56 ----------------
 test/table_formatter_test.rb  | 119 ++++++++++++++++++++++++++++++++++
 5 files changed, 198 insertions(+), 97 deletions(-)
 create mode 100644 lib/table_formatter.rb
 create mode 100644 test/table_formatter_test.rb

diff --git a/lib/benchmark_runner.rb b/lib/benchmark_runner.rb
index 6e350e86..69cce2a9 100644
--- a/lib/benchmark_runner.rb
+++ b/lib/benchmark_runner.rb
@@ -8,42 +8,6 @@
 module BenchmarkRunner
   module_function
 
-  # Format benchmark data as a string table
-  def table_to_str(table_data, format, failures)
-    # Trim numbers to one decimal for console display
-    # Keep two decimals for the speedup ratios
-
-    failure_rows = failures.map { |_exe, data| data.keys }.flatten.uniq
-                           .map { |name| [name] + (['N/A'] * (table_data.first.size - 1)) }
-
-    table_data = table_data.first(1) + failure_rows + table_data.drop(1).map { |row|
-      format.zip(row).map { |fmt, data| fmt % data }
-    }
-
-    num_rows = table_data.length
-    num_cols = table_data[0].length
-
-    # Pad each column to the maximum width in the column
-    (0...num_cols).each do |c|
-      cell_lens = (0...num_rows).map { |r| table_data[r][c].length }
-      max_width = cell_lens.max
-      (0...num_rows).each { |r| table_data[r][c] = table_data[r][c].ljust(max_width) }
-    end
-
-    # Row of separator dashes
-    sep_row = (0...num_cols).map { |i| '-' * table_data[0][i].length }.join('  ').rstrip
-
-    out = sep_row + "\n"
-
-    table_data.each do |row|
-      out += row.join('  ').rstrip + "\n"
-    end
-
-    out += sep_row + "\n"
-
-    out
-  end
-
   # Find the first available file number for output files
   def free_file_no(prefix)
     (1..).each do |file_no|
diff --git a/lib/table_formatter.rb b/lib/table_formatter.rb
new file mode 100644
index 00000000..325858ea
--- /dev/null
+++ b/lib/table_formatter.rb
@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+
+# Formats benchmark data as an ASCII table with aligned columns
+class TableFormatter
+  COLUMN_SEPARATOR = '  '
+  FAILURE_PLACEHOLDER = 'N/A'
+
+  def initialize(table_data, format, failures)
+    @header = table_data.first
+    @data_rows = table_data.drop(1)
+    @format = format
+    @failures = failures
+    @num_columns = @header.size
+  end
+
+  def to_s
+    rows = build_all_rows
+    col_widths = calculate_column_widths(rows)
+
+    format_table(rows, col_widths)
+  end
+
+  private
+
+  attr_reader :num_columns
+
+  def build_all_rows
+    [@header, *build_failure_rows, *build_formatted_data_rows]
+  end
+
+  def build_failure_rows
+    return [] if @failures.empty?
+
+    failed_benchmarks = extract_failed_benchmarks
+    failed_benchmarks.map { |name| build_failure_row(name) }
+  end
+
+  def extract_failed_benchmarks
+    @failures.flat_map { |_exe, data| data.keys }.uniq
+  end
+
+  def build_failure_row(benchmark_name)
+    [benchmark_name, *Array.new(num_columns - 1, FAILURE_PLACEHOLDER)]
+  end
+
+  def build_formatted_data_rows
+    @data_rows.map { |row| apply_format(row) }
+  end
+
+  def apply_format(row)
+    @format.zip(row).map { |fmt, data| fmt % data }
+  end
+
+  def calculate_column_widths(rows)
+    (0...num_columns).map do |col_index|
+      rows.map { |row| row[col_index].length }.max
+    end
+  end
+
+  def format_table(rows, col_widths)
+    separator = build_separator(col_widths)
+
+    formatted_rows = rows.map { |row| format_row(row, col_widths) }
+
+    [separator, *formatted_rows, separator].join("\n") + "\n"
+  end
+
+  def build_separator(col_widths)
+    col_widths.map { |width| '-' * width }.join(COLUMN_SEPARATOR)
+  end
+
+  def format_row(row, col_widths)
+    row.map.with_index { |cell, i| cell.ljust(col_widths[i]) }
+       .join(COLUMN_SEPARATOR)
+       .rstrip
+  end
+end
diff --git a/run_benchmarks.rb b/run_benchmarks.rb
index 2e0b5bbc..c434bc8d 100755
--- a/run_benchmarks.rb
+++ b/run_benchmarks.rb
@@ -12,6 +12,7 @@
 require 'yaml'
 require_relative 'misc/stats'
 require_relative 'lib/benchmark_runner'
+require_relative 'lib/table_formatter'
 
 # Check which OS we are running
 def os
@@ -109,10 +110,6 @@ def performance_governor?
   end
 end
 
-def table_to_str(table_data, format, failures)
-  BenchmarkRunner.table_to_str(table_data, format, failures)
-end
-
 def mean(values)
   Stats.new(values).mean
 end
@@ -525,7 +522,7 @@ def run_benchmarks(ruby:, ruby_description:, categories:, name_filters:, out_pat
   output_str << "#{key}: #{value}\n"
 end
 output_str += "\n"
-output_str += table_to_str(table, format, bench_failures) + "\n"
+output_str += TableFormatter.new(table, format, bench_failures).to_s + "\n"
 unless other_names.empty?
   output_str << "Legend:\n"
   other_names.each do |name|
diff --git a/test/benchmark_runner_test.rb b/test/benchmark_runner_test.rb
index a9265801..40a2ea4d 100644
--- a/test/benchmark_runner_test.rb
+++ b/test/benchmark_runner_test.rb
@@ -9,62 +9,6 @@
 require 'yaml'
 
 describe BenchmarkRunner do
-  describe '.table_to_str' do
-    it 'formats a simple table correctly' do
-      table_data = [
-        ['bench', 'time (ms)', 'stddev (%)'],
-        ['fib', '100.5', '2.3'],
-        ['loop', '50.2', '1.1']
-      ]
-      format = ['%s', '%s', '%s']
-      failures = {}
-
-      result = BenchmarkRunner.table_to_str(table_data, format, failures)
-
-      assert_equal <<~TABLE, result
-        -----  ---------  ----------
-        bench  time (ms)  stddev (%)
-        fib    100.5      2.3
-        loop   50.2       1.1
-        -----  ---------  ----------
-      TABLE
-    end
-
-    it 'includes failure rows when failures are present' do
-      table_data = [
-        ['bench', 'time (ms)'],
-        ['fib', '100.5']
-      ]
-      format = ['%s', '%s']
-      failures = { 'ruby' => { 'broken_bench' => 1 } }
-
-      result = BenchmarkRunner.table_to_str(table_data, format, failures)
-
-      assert_equal <<~TABLE, result
-        ------------  ---------
-        bench         time (ms)
-        broken_bench  N/A
-        fib           100.5
-        ------------  ---------
-      TABLE
-    end
-
-    it 'handles empty failures hash' do
-      table_data = [['bench'], ['fib']]
-      format = ['%s']
-      failures = {}
-
-      result = BenchmarkRunner.table_to_str(table_data, format, failures)
-
-      assert_equal <<~TABLE, result
-        -----
-        bench
-        fib
-        -----
-      TABLE
-    end
-  end
-
   describe '.free_file_no' do
     it 'returns 1 when no files exist' do
       Dir.mktmpdir do |dir|
diff --git a/test/table_formatter_test.rb b/test/table_formatter_test.rb
new file mode 100644
index 00000000..561af940
--- /dev/null
+++ b/test/table_formatter_test.rb
@@ -0,0 +1,119 @@
+require_relative 'test_helper'
+require_relative '../lib/table_formatter'
+
+describe TableFormatter do
+  describe '#to_s' do
+    it 'formats a simple table correctly' do
+      table_data = [
+        ['bench', 'time (ms)', 'stddev (%)'],
+        ['fib', '100.5', '2.3'],
+        ['loop', '50.2', '1.1']
+      ]
+      format = ['%s', '%s', '%s']
+      failures = {}
+
+      result = TableFormatter.new(table_data, format, failures).to_s
+
+      assert_equal <<~TABLE, result
+        -----  ---------  ----------
+        bench  time (ms)  stddev (%)
+        fib    100.5      2.3
+        loop   50.2       1.1
+        -----  ---------  ----------
+      TABLE
+    end
+
+    it 'includes failure rows when failures are present' do
+      table_data = [
+        ['bench', 'time (ms)'],
+        ['fib', '100.5']
+      ]
+      format = ['%s', '%s']
+      failures = { 'ruby' => { 'broken_bench' => 1 } }
+
+      result = TableFormatter.new(table_data, format, failures).to_s
+
+      assert_equal <<~TABLE, result
+        ------------  ---------
+        bench         time (ms)
+        broken_bench  N/A
+        fib           100.5
+        ------------  ---------
+      TABLE
+    end
+
+    it 'handles empty failures hash' do
+      table_data = [['bench'], ['fib']]
+      format = ['%s']
+      failures = {}
+
+      result = TableFormatter.new(table_data, format, failures).to_s
+
+      assert_equal <<~TABLE, result
+        -----
+        bench
+        fib
+        -----
+      TABLE
+    end
+
+    it 'handles empty failures hash' do
+      table_data = [['bench'], ['fib']]
+      format = ['%s']
+      failures = {}
+
+      result = TableFormatter.new(table_data, format, failures).to_s
+      refute_includes result, 'N/A'
+    end
+
+    it 'handles multiple failures from different executables' do
+      table_data = [
+        ['bench', 'time (ms)'],
+        ['fib', '100.5']
+      ]
+      format = ['%s', '%s']
+      failures = {
+        'ruby' => { 'broken_bench' => 1 },
+        'ruby-yjit' => { 'another_broken' => 1 }
+      }
+
+      result = TableFormatter.new(table_data, format, failures).to_s
+
+      assert_includes result, 'broken_bench'
+      assert_includes result, 'another_broken'
+      # Count N/A occurrences - should have 2 (one for each failed benchmark)
+      assert_equal 2, result.scan(/N\/A/).count
+    end
+
+    it 'removes trailing spaces from last column' do
+      table_data = [
+        ['bench', 'time (ms)', 'stddev (%)'],
+        ['fib', '100.5', '2.3']
+      ]
+      format = ['%s', '%s', '%s']
+      failures = {}
+
+      result = TableFormatter.new(table_data, format, failures).to_s
+      lines = result.lines
+
+      # No line should have trailing spaces before the newline
+      lines.each do |line|
+        refute_match(/ \n\z/, line, "Line should not have trailing spaces: #{line.inspect}")
+      end
+    end
+
+    it 'applies format strings correctly' do
+      table_data = [
+        ['bench', 'time'],
+        ['fib', 123.456]
+      ]
+      format = ['%s', '%.1f']
+      failures = {}
+
+      result = TableFormatter.new(table_data, format, failures).to_s
+
+      assert_includes result, '123.5' # Should round to 1 decimal
+      refute_includes result, '123.456'
+    end
+  end
+end

From 2ee9831897e3dedd82d39e20351ea809e2cb8693 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafael=20Mendon=C3=A7a=20Fran=C3=A7a?=
 <rafael.franca@shopify.com>
Date: Fri, 14 Nov 2025 23:10:29 +0000
Subject: [PATCH 3/6] Run tests on CI

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 69a64437..a2a3633d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -27,7 +27,7 @@ jobs:
           ruby-version: ${{ matrix.ruby }}
 
       - name: Run tests
-        run: ruby test/benchmarks_test.rb
+        run: rake test
 
       - name: Test run_benchmarks.rb
         run: ./run_benchmarks.rb

From 90e156a375be92cf9f4c35747046aa94ace09eab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafael=20Mendon=C3=A7a=20Fran=C3=A7a?=
 <rafael.franca@shopify.com>
Date: Fri, 14 Nov 2025 23:14:46 +0000
Subject: [PATCH 4/6] Rename argument to make it clearer

---
 lib/benchmark_runner.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/benchmark_runner.rb b/lib/benchmark_runner.rb
index 69cce2a9..33459676 100644
--- a/lib/benchmark_runner.rb
+++ b/lib/benchmark_runner.rb
@@ -9,9 +9,9 @@ module BenchmarkRunner
   module_function
 
   # Find the first available file number for output files
-  def free_file_no(prefix)
+  def free_file_no(directory)
     (1..).each do |file_no|
-      out_path = File.join(prefix, "output_%03d.csv" % file_no)
+      out_path = File.join(directory, "output_%03d.csv" % file_no)
       return file_no unless File.exist?(out_path)
     end
   end

From 3a4cb10692e1d5cc8ba5fab6f1c6be30519130d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafael=20Mendon=C3=A7a=20Fran=C3=A7a?=
 <rafael.franca@shopify.com>
Date: Fri, 14 Nov 2025 23:16:08 +0000
Subject: [PATCH 5/6] Remove unneeded os method and use BenchmarkRunner.os
 directly

---
 run_benchmarks.rb | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/run_benchmarks.rb b/run_benchmarks.rb
index c434bc8d..cf6aabb1 100755
--- a/run_benchmarks.rb
+++ b/run_benchmarks.rb
@@ -14,11 +14,6 @@
 require_relative 'lib/benchmark_runner'
 require_relative 'lib/table_formatter'
 
-# Check which OS we are running
-def os
-  BenchmarkRunner.os
-end
-
 # Checked system - error or return info if the command fails
 def check_call(command, env: {}, raise_error: true, quiet: false)
   puts("+ #{command}") unless quiet
@@ -189,7 +184,7 @@ def run_benchmarks(ruby:, ruby_description:, categories:, name_filters:, out_pat
 
       # Set up the benchmarking command
       cmd = []
-      if os == :linux
+      if BenchmarkRunner.os == :linux
         cmd += setarch_prefix
 
         # Pin the process to one given core to improve caching and reduce variance on CRuby

From 76750e89095dfc2163a7b6645130f73ef9a086d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafael=20Mendon=C3=A7a=20Fran=C3=A7a?=
 <rafael.franca@shopify.com>
Date: Fri, 14 Nov 2025 23:33:46 +0000
Subject: [PATCH 6/6] Use hash lookup instead of find to get benchmark metadata

---
 lib/benchmark_runner.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/benchmark_runner.rb b/lib/benchmark_runner.rb
index 33459676..c69423b7 100644
--- a/lib/benchmark_runner.rb
+++ b/lib/benchmark_runner.rb
@@ -18,7 +18,7 @@ def free_file_no(directory)
 
   # Get benchmark categories from metadata
   def benchmark_categories(name, metadata)
-    benchmark_metadata = metadata.find { |benchmark, _metadata| benchmark == name }&.last || {}
+    benchmark_metadata = metadata[name] || {}
     categories = [benchmark_metadata.fetch('category', 'other')]
     categories << 'ractor' if benchmark_metadata['ractor']
     categories