diff --git a/src/backtrack.rs b/src/backtrack.rs
index 6100c1730..c1dff8499 100644
--- a/src/backtrack.rs
+++ b/src/backtrack.rs
@@ -18,7 +18,7 @@
 
 use exec::ProgramCache;
 use input::{Input, InputAt};
-use prog::{InstPtr, Program};
+use prog::{BytesInst, InstPtr, InstTrait, Program, UnicodeInst};
 use re_trait::Slot;
 
 type Bits = u32;
@@ -41,8 +41,8 @@ pub fn should_exec(num_insts: usize, text_len: usize) -> bool {
 
 /// A backtracking matching engine.
 #[derive(Debug)]
-pub struct Bounded<'a, 'm, 'r, 's, I> {
-    prog: &'r Program,
+pub struct Bounded<'a, 'm, 'r, 's, I, P: InstTrait> {
+    prog: &'r Program<P>,
     input: I,
     matches: &'m mut [bool],
     slots: &'s mut [Slot],
@@ -59,7 +59,7 @@ pub struct Cache {
 
 impl Cache {
     /// Create new empty cache for the backtracking engine.
-    pub fn new(_prog: &Program) -> Self {
+    pub fn new<I: InstTrait>(_prog: &Program<I>) -> Self {
         Cache { jobs: vec![], visited: vec![] }
     }
 }
@@ -76,13 +76,15 @@ enum Job {
     SaveRestore { slot: usize, old_pos: Option<usize> },
 }
 
-impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
+impl<'a, 'm, 'r, 's, I: Input, P: InstTrait + Step>
+    Bounded<'a, 'm, 'r, 's, I, P>
+{
     /// Execute the backtracking matching engine.
     ///
     /// If there's a match, `exec` returns `true` and populates the given
     /// captures accordingly.
     pub fn exec(
-        prog: &'r Program,
+        prog: &'r Program<P>,
         cache: &ProgramCache,
         matches: &'m mut [bool],
         slots: &'s mut [Slot],
@@ -93,14 +95,14 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
         let mut cache = cache.borrow_mut();
         let cache = &mut cache.backtrack;
         let start = input.at(start);
-        let mut b = Bounded {
+        Bounded {
             prog: prog,
             input: input,
             matches: matches,
             slots: slots,
             m: cache,
-        };
-        b.exec_(start, end)
+        }
+        .exec_(start, end)
     }
 
     /// Clears the cache such that the backtracking engine can be executed
@@ -196,7 +198,6 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
     }
 
     fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool {
-        use prog::Inst::*;
         loop {
             // This loop is an optimization to avoid constantly pushing/popping
             // from the stack. Namely, if we're pushing a job only to run it
@@ -205,64 +206,12 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
             if self.has_visited(ip, at) {
                 return false;
             }
-            match self.prog[ip] {
-                Match(slot) => {
-                    if slot < self.matches.len() {
-                        self.matches[slot] = true;
-                    }
-                    return true;
-                }
-                Save(ref inst) => {
-                    if let Some(&old_pos) = self.slots.get(inst.slot) {
-                        // If this path doesn't work out, then we save the old
-                        // capture index (if one exists) in an alternate
-                        // job. If the next path fails, then the alternate
-                        // job is popped and the old capture index is restored.
-                        self.m.jobs.push(Job::SaveRestore {
-                            slot: inst.slot,
-                            old_pos: old_pos,
-                        });
-                        self.slots[inst.slot] = Some(at.pos());
-                    }
-                    ip = inst.goto;
-                }
-                Split(ref inst) => {
-                    self.m.jobs.push(Job::Inst { ip: inst.goto2, at: at });
-                    ip = inst.goto1;
-                }
-                EmptyLook(ref inst) => {
-                    if self.input.is_empty_match(at, inst) {
-                        ip = inst.goto;
-                    } else {
-                        return false;
-                    }
-                }
-                Char(ref inst) => {
-                    if inst.c == at.char() {
-                        ip = inst.goto;
-                        at = self.input.at(at.next_pos());
-                    } else {
-                        return false;
-                    }
-                }
-                Ranges(ref inst) => {
-                    if inst.matches(at.char()) {
-                        ip = inst.goto;
-                        at = self.input.at(at.next_pos());
-                    } else {
-                        return false;
-                    }
-                }
-                Bytes(ref inst) => {
-                    if let Some(b) = at.byte() {
-                        if inst.matches(b) {
-                            ip = inst.goto;
-                            at = self.input.at(at.next_pos());
-                            continue;
-                        }
-                    }
-                    return false;
+            match self.prog[ip].step(self, at) {
+                Ok((next_ip, next_at)) => {
+                    ip = next_ip;
+                    at = next_at;
                 }
+                Err(res) => return res,
             }
         }
     }
@@ -280,6 +229,125 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> {
     }
 }
 
+/// Executes one instruction on behalf of the backtracking engine.
+///
+/// Implemented for `UnicodeInst` and `BytesInst` so that `Bounded::step`
+/// can stay generic over the kind of program it runs.
+pub trait Step: InstTrait + Sized {
+    /// Runs `self` at input position `at`. `Ok` carries the instruction
+    /// pointer and input position to continue from. `Err` ends the current
+    /// path and carries its outcome: `true` on a match, `false` otherwise.
+    fn step<I: Input>(
+        &self,
+        bounded: &mut Bounded<'_, '_, '_, '_, I, Self>,
+        at: InputAt,
+    ) -> Result<(InstPtr, InputAt), bool>;
+}
+
+impl Step for UnicodeInst {
+    fn step<I: Input>(
+        &self,
+        bounded: &mut Bounded<'_, '_, '_, '_, I, Self>,
+        at: InputAt,
+    ) -> Result<(InstPtr, InputAt), bool> {
+        use prog::UnicodeInst::*;
+        match *self {
+            Match(slot) => {
+                if slot < bounded.matches.len() {
+                    bounded.matches[slot] = true;
+                }
+                Err(true)
+            }
+            Save(ref inst) => {
+                if let Some(&old_pos) = bounded.slots.get(inst.slot) {
+                    // If this path doesn't work out, then we save the old
+                    // capture index (if one exists) in an alternate
+                    // job. If the next path fails, then the alternate
+                    // job is popped and the old capture index is restored.
+                    bounded.m.jobs.push(Job::SaveRestore {
+                        slot: inst.slot,
+                        old_pos: old_pos,
+                    });
+                    bounded.slots[inst.slot] = Some(at.pos());
+                }
+                Ok((inst.goto, at))
+            }
+            Split(ref inst) => {
+                bounded.m.jobs.push(Job::Inst { ip: inst.goto2, at: at });
+                Ok((inst.goto1, at))
+            }
+            EmptyLook(ref inst) => {
+                if bounded.input.is_empty_match(at, inst) {
+                    Ok((inst.goto, at))
+                } else {
+                    Err(false)
+                }
+            }
+            // A matching `Char`/`Ranges` moves on to the next position.
+            Char(ref inst) if inst.c == at.char() => {
+                Ok((inst.goto, bounded.input.at(at.next_pos())))
+            }
+            Ranges(ref inst) if inst.matches(at.char()) => {
+                Ok((inst.goto, bounded.input.at(at.next_pos())))
+            }
+            // Anything else is a dead end for this path.
+            Char(_) | Ranges(_) => Err(false),
+        }
+    }
+}
+
+impl Step for BytesInst {
+    fn step<I: Input>(
+        &self,
+        bounded: &mut Bounded<'_, '_, '_, '_, I, Self>,
+        at: InputAt,
+    ) -> Result<(InstPtr, InputAt), bool> {
+        use prog::BytesInst::*;
+        match *self {
+            Match(slot) => {
+                if slot < bounded.matches.len() {
+                    bounded.matches[slot] = true;
+                }
+                Err(true)
+            }
+            Save(ref inst) => {
+                if let Some(&old_pos) = bounded.slots.get(inst.slot) {
+                    // If this path doesn't work out, then we save the old
+                    // capture index (if one exists) in an alternate
+                    // job. If the next path fails, then the alternate
+                    // job is popped and the old capture index is restored.
+                    bounded.m.jobs.push(Job::SaveRestore {
+                        slot: inst.slot,
+                        old_pos: old_pos,
+                    });
+                    bounded.slots[inst.slot] = Some(at.pos());
+                }
+                Ok((inst.goto, at))
+            }
+            Split(ref inst) => {
+                bounded.m.jobs.push(Job::Inst { ip: inst.goto2, at: at });
+                Ok((inst.goto1, at))
+            }
+            EmptyLook(ref inst) => {
+                if bounded.input.is_empty_match(at, inst) {
+                    Ok((inst.goto, at))
+                } else {
+                    Err(false)
+                }
+            }
+            Bytes(ref inst) => match at.byte() {
+                // A byte within the instruction's range moves on to the
+                // next input position.
+                Some(b) if inst.matches(b) => {
+                    Ok((inst.goto, bounded.input.at(at.next_pos())))
+                }
+                // No byte here, or one out of range: this path is dead.
+                _ => Err(false),
+            },
+        }
+    }
+}
+
 fn usize_to_u32(n: usize) -> u32 {
     if (n as u64) > (::std::u32::MAX as u64) {
         panic!("BUG: {} is too big to fit into u32", n)
diff --git a/src/compile.rs b/src/compile.rs
index 9ffd34704..08d9b6849 100644
--- a/src/compile.rs
+++ b/src/compile.rs
@@ -1,6 +1,7 @@
 use std::collections::HashMap;
 use std::fmt;
 use std::iter;
+use std::mem;
 use std::result;
 use std::sync::Arc;
 
@@ -9,8 +10,8 @@ use syntax::is_word_byte;
 use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences};
 
 use prog::{
-    EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges,
-    InstSave, InstSplit, Program,
+    BytesInst, EmptyLook, InstBytes, InstChar, InstEmptyLook, InstPtr,
+    InstRanges, InstSave, InstSplit, InstTrait, Program, UnicodeInst,
 };
 
 use Error;
@@ -29,9 +30,9 @@ struct Patch {
 // `Compiler` is only public via the `internal` module, so avoid deriving
 // `Debug`.
 #[allow(missing_debug_implementations)]
-pub struct Compiler {
-    insts: Vec<MaybeInst>,
-    compiled: Program,
+pub struct Compiler<I: InstTrait> {
+    insts: Vec<MaybeInst<I>>,
+    compiled: Program<I>,
     capture_name_idx: HashMap<String, usize>,
     num_exprs: usize,
     size_limit: usize,
@@ -40,7 +41,7 @@ pub struct Compiler {
     byte_classes: ByteClassSet,
 }
 
-impl Compiler {
+impl<I: InstTrait> Compiler<I> {
     /// Create a new regular expression compiler.
     ///
     /// Various options can be set before calling `compile` on an expression.
@@ -65,22 +66,6 @@ impl Compiler {
         self
     }
 
-    /// If bytes is true, then the program is compiled as a byte based
-    /// automaton, which incorporates UTF-8 decoding into the machine. If it's
-    /// false, then the automaton is Unicode scalar value based, e.g., an
-    /// engine utilizing such an automaton is responsible for UTF-8 decoding.
-    ///
-    /// The specific invariant is that when returning a byte based machine,
-    /// the neither the `Char` nor `Ranges` instructions are produced.
-    /// Conversely, when producing a Unicode scalar value machine, the `Bytes`
-    /// instruction is never produced.
-    ///
-    /// Note that `dfa(true)` implies `bytes(true)`.
-    pub fn bytes(mut self, yes: bool) -> Self {
-        self.compiled.is_bytes = yes;
-        self
-    }
-
     /// When disabled, the program compiled may match arbitrary bytes.
     ///
     /// When enabled (the default), all compiled programs exclusively match
@@ -108,13 +93,18 @@ impl Compiler {
         self.compiled.is_reverse = yes;
         self
     }
+}
 
+impl<I: InstTrait + From<(InstHole, usize)>> Compiler<I> {
     /// Compile a regular expression given its AST.
     ///
     /// The compiler is guaranteed to succeed unless the program exceeds the
     /// specified size limit. If the size limit is exceeded, then compilation
     /// stops and returns an error.
-    pub fn compile(mut self, exprs: &[Hir]) -> result::Result<Program, Error> {
+    pub fn compile(
+        mut self,
+        exprs: &[Hir],
+    ) -> result::Result<Program<I>, Error> {
         debug_assert!(!exprs.is_empty());
         self.num_exprs = exprs.len();
         if exprs.len() == 1 {
@@ -124,7 +114,7 @@ impl Compiler {
         }
     }
 
-    fn compile_one(mut self, expr: &Hir) -> result::Result<Program, Error> {
+    fn compile_one(mut self, expr: &Hir) -> result::Result<Program<I>, Error> {
         // If we're compiling a forward DFA and we aren't anchored, then
         // add a `.*?` before the first capture group.
         // Other matching engines handle this by baking the logic into the
@@ -145,14 +135,14 @@ impl Compiler {
         }
         self.fill_to_next(patch.hole);
         self.compiled.matches = vec![self.insts.len()];
-        self.push_compiled(Inst::Match(0));
+        self.push_compiled(I::new_match(0));
         self.compile_finish()
     }
 
     fn compile_many(
         mut self,
         exprs: &[Hir],
-    ) -> result::Result<Program, Error> {
+    ) -> result::Result<Program<I>, Error> {
         debug_assert!(exprs.len() > 1);
 
         self.compiled.is_anchored_start =
@@ -176,7 +166,7 @@ impl Compiler {
                 self.c_capture(0, expr)?.unwrap_or(self.next_inst());
             self.fill_to_next(hole);
             self.compiled.matches.push(self.insts.len());
-            self.push_compiled(Inst::Match(i));
+            self.push_compiled(I::new_match(i));
             prev_hole = self.fill_split(split, Some(entry), None);
         }
         let i = exprs.len() - 1;
@@ -185,11 +175,11 @@ impl Compiler {
         self.fill(prev_hole, entry);
         self.fill_to_next(hole);
         self.compiled.matches.push(self.insts.len());
-        self.push_compiled(Inst::Match(i));
+        self.push_compiled(I::new_match(i));
         self.compile_finish()
     }
 
-    fn compile_finish(mut self) -> result::Result<Program, Error> {
+    fn compile_finish(mut self) -> result::Result<Program<I>, Error> {
         self.compiled.insts =
             self.insts.into_iter().map(|inst| inst.unwrap()).collect();
         self.compiled.byte_classes = self.byte_classes.byte_classes();
@@ -474,9 +464,9 @@ impl Compiler {
         Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 }))
     }
 
-    fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty
+    fn c_concat<'a, E>(&mut self, exprs: E) -> ResultOrEmpty
     where
-        I: IntoIterator<Item = &'a Hir>,
+        E: IntoIterator<Item = &'a Hir>,
     {
         let mut exprs = exprs.into_iter();
         let Patch { mut hole, entry } = loop {
@@ -771,7 +761,7 @@ impl Compiler {
         }
     }
 
-    fn push_compiled(&mut self, inst: Inst) {
+    fn push_compiled(&mut self, inst: I) {
         self.insts.push(MaybeInst::Compiled(inst));
     }
 
@@ -795,7 +785,7 @@ impl Compiler {
     fn check_size(&self) -> result::Result<(), Error> {
         use std::mem::size_of;
 
-        if self.insts.len() * size_of::<Inst>() > self.size_limit {
+        if self.insts.len() * size_of::<I>() > self.size_limit {
             Err(Error::CompiledTooBig(self.size_limit))
         } else {
             Ok(())
@@ -822,29 +812,31 @@ impl Hole {
 }
 
 #[derive(Clone, Debug)]
-enum MaybeInst {
-    Compiled(Inst),
+enum MaybeInst<I> {
+    Compiled(I),
     Uncompiled(InstHole),
     Split,
     Split1(InstPtr),
     Split2(InstPtr),
 }
 
-impl MaybeInst {
+impl<I: InstTrait + From<(InstHole, usize)>> MaybeInst<I> {
     fn fill(&mut self, goto: InstPtr) {
         let maybeinst = match *self {
             MaybeInst::Split => MaybeInst::Split1(goto),
-            MaybeInst::Uncompiled(ref inst) => {
-                MaybeInst::Compiled(inst.fill(goto))
+            MaybeInst::Uncompiled(ref mut inst) => {
+                // Take the hole by value, leaving a dummy `InstHole` behind.
+                let inst = mem::replace(inst, InstHole::Save { slot: 0 });
+                MaybeInst::Compiled((inst, goto).into())
             }
             MaybeInst::Split1(goto1) => {
-                MaybeInst::Compiled(Inst::Split(InstSplit {
+                MaybeInst::Compiled(I::new_split(InstSplit {
                     goto1: goto1,
                     goto2: goto,
                 }))
             }
             MaybeInst::Split2(goto2) => {
-                MaybeInst::Compiled(Inst::Split(InstSplit {
+                MaybeInst::Compiled(I::new_split(InstSplit {
                     goto1: goto,
                     goto2: goto2,
                 }))
@@ -861,7 +853,7 @@ impl MaybeInst {
     fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) {
         let filled = match *self {
             MaybeInst::Split => {
-                Inst::Split(InstSplit { goto1: goto1, goto2: goto2 })
+                I::new_split(InstSplit { goto1: goto1, goto2: goto2 })
             }
             _ => unreachable!(
                 "must be called on Split instruction, \
@@ -896,7 +888,7 @@ impl MaybeInst {
         *self = MaybeInst::Split2(half_filled);
     }
 
-    fn unwrap(self) -> Inst {
+    fn unwrap(self) -> I {
         match self {
             MaybeInst::Compiled(inst) => inst,
             _ => unreachable!(
@@ -908,8 +900,10 @@ impl MaybeInst {
     }
 }
 
+// TODO: Specialize `compile` into `compile_bytes` and `compile_unicode`
+// to avoid making `InstHole` public?
 #[derive(Clone, Debug)]
-enum InstHole {
+pub enum InstHole {
     Save { slot: usize },
     EmptyLook { look: EmptyLook },
     Char { c: char },
@@ -917,32 +911,60 @@ enum InstHole {
     Bytes { start: u8, end: u8 },
 }
 
-impl InstHole {
-    fn fill(&self, goto: InstPtr) -> Inst {
-        match *self {
+impl From<(InstHole, InstPtr)> for UnicodeInst {
+    fn from(val: (InstHole, InstPtr)) -> UnicodeInst {
+        let (hole, goto) = val;
+        match hole {
             InstHole::Save { slot } => {
-                Inst::Save(InstSave { goto: goto, slot: slot })
+                UnicodeInst::Save(InstSave { goto: goto, slot: slot })
             }
             InstHole::EmptyLook { look } => {
-                Inst::EmptyLook(InstEmptyLook { goto: goto, look: look })
+                UnicodeInst::EmptyLook(InstEmptyLook {
+                    goto: goto,
+                    look: look,
+                })
+            }
+            InstHole::Char { c } => {
+                UnicodeInst::Char(InstChar { goto: goto, c: c })
             }
-            InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }),
-            InstHole::Ranges { ref ranges } => {
-                Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() })
+            InstHole::Ranges { ranges } => {
+                // `hole` is owned by this function, so the ranges are
+                // moved into the instruction; cloning them would only
+                // create a copy while the original is thrown away.
+                UnicodeInst::Ranges(InstRanges { goto: goto, ranges: ranges })
             }
-            InstHole::Bytes { start, end } => {
-                Inst::Bytes(InstBytes { goto: goto, start: start, end: end })
+            InstHole::Bytes { .. } => unreachable!("Bytes in UnicodeInst"),
+        }
+    }
+}
+
+impl From<(InstHole, InstPtr)> for BytesInst {
+    fn from(val: (InstHole, InstPtr)) -> BytesInst {
+        let (hole, goto) = val;
+        match hole {
+            InstHole::Save { slot } => {
+                BytesInst::Save(InstSave { goto: goto, slot: slot })
             }
+            InstHole::EmptyLook { look } => {
+                BytesInst::EmptyLook(InstEmptyLook { goto: goto, look: look })
+            }
+            InstHole::Char { .. } => unreachable!("Char in BytesInst"),
+            InstHole::Ranges { .. } => unreachable!("Ranges in BytesInst"),
+            InstHole::Bytes { start, end } => BytesInst::Bytes(InstBytes {
+                goto: goto,
+                start: start,
+                end: end,
+            }),
         }
     }
 }
 
-struct CompileClass<'a, 'b> {
-    c: &'a mut Compiler,
+struct CompileClass<'a, 'b, I: InstTrait> {
+    c: &'a mut Compiler<I>,
     ranges: &'b [hir::ClassUnicodeRange],
 }
 
-impl<'a, 'b> CompileClass<'a, 'b> {
+impl<'a, 'b, I: InstTrait + From<(InstHole, usize)>> CompileClass<'a, 'b, I> {
     fn compile(mut self) -> Result {
         let mut holes = vec![];
         let mut initial_entry = None;
@@ -992,9 +1014,9 @@ impl<'a, 'b> CompileClass<'a, 'b> {
         }
     }
 
-    fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result
+    fn c_utf8_seq_<'r, S>(&mut self, seq: S) -> Result
     where
-        I: IntoIterator<Item = &'r Utf8Range>,
+        S: IntoIterator<Item = &'r Utf8Range>,
     {
         // The initial instruction for each UTF-8 sequence should be the same.
         let mut from_inst = ::std::usize::MAX;
@@ -1013,17 +1035,14 @@ impl<'a, 'b> CompileClass<'a, 'b> {
                 }
             }
             self.c.byte_classes.set_range(byte_range.start, byte_range.end);
+            let inst_hole = InstHole::Bytes {
+                start: byte_range.start,
+                end: byte_range.end,
+            };
             if from_inst == ::std::usize::MAX {
-                last_hole = self.c.push_hole(InstHole::Bytes {
-                    start: byte_range.start,
-                    end: byte_range.end,
-                });
+                last_hole = self.c.push_hole(inst_hole);
             } else {
-                self.c.push_compiled(Inst::Bytes(InstBytes {
-                    goto: from_inst,
-                    start: byte_range.start,
-                    end: byte_range.end,
-                }));
+                self.c.push_compiled((inst_hole, from_inst).into());
             }
             from_inst = self.c.insts.len().checked_sub(1).unwrap();
             debug_assert!(from_inst < ::std::usize::MAX);
diff --git a/src/dfa.rs b/src/dfa.rs
index 9ac0c2c39..523c2f615 100644
--- a/src/dfa.rs
+++ b/src/dfa.rs
@@ -43,7 +43,7 @@ use std::mem;
 use std::sync::Arc;
 
 use exec::ProgramCache;
-use prog::{Inst, Program};
+use prog::{BytesInst, InstTrait, Program};
 use sparse::SparseSet;
 
 /// Return true if and only if the given program can be executed by a DFA.
@@ -54,8 +54,7 @@ use sparse::SparseSet;
 ///
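-/// This function will also return false if the given program has any Unicode
-/// instructions (Char or Ranges) since the DFA operates on bytes only.
+/// Unicode instructions (Char or Ranges) need no check here: they cannot
+/// occur in a `Program<BytesInst>`, and the DFA operates on bytes only.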
-pub fn can_exec(insts: &Program) -> bool {
-    use prog::Inst::*;
+pub fn can_exec(insts: &Program<BytesInst>) -> bool {
     // If for some reason we manage to allocate a regex program with more
     // than i32::MAX instructions, then we can't execute the DFA because we
     // use 32 bit instruction pointer deltas for memory savings.
@@ -65,12 +64,6 @@ pub fn can_exec(insts: &Program) -> bool {
     if insts.dfa_size_limit == 0 || insts.len() > ::std::i32::MAX as usize {
         return false;
     }
-    for inst in insts {
-        match *inst {
-            Char(_) | Ranges(_) => return false,
-            EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => {}
-        }
-    }
     true
 }
 
@@ -172,7 +165,7 @@ pub struct Fsm<'a> {
     /// the `dfa` instructions or the `dfa_reverse` instructions from
     /// `exec::ExecReadOnly`. (It never uses `ExecReadOnly.nfa`, which may have
     /// Unicode opcodes that cannot be executed by the DFA.)
-    prog: &'a Program,
+    prog: &'a Program<BytesInst>,
     /// The start state. We record it here because the pointer may change
     /// when the cache is wiped.
     start: StatePtr,
@@ -411,7 +404,7 @@ struct StateFlags(u8);
 
 impl Cache {
     /// Create new empty cache for the DFA engine.
-    pub fn new(prog: &Program) -> Self {
+    pub fn new(prog: &Program<BytesInst>) -> Self {
         // We add 1 to account for the special EOF byte.
         let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1;
         let starts = vec![STATE_UNKNOWN; 256];
@@ -445,7 +438,7 @@ impl CacheInner {
 impl<'a> Fsm<'a> {
     #[cfg_attr(feature = "perf-inline", inline(always))]
     pub fn forward(
-        prog: &'a Program,
+        prog: &'a Program<BytesInst>,
         cache: &ProgramCache,
         quit_after_match: bool,
         text: &[u8],
@@ -475,7 +468,7 @@ impl<'a> Fsm<'a> {
 
     #[cfg_attr(feature = "perf-inline", inline(always))]
     pub fn reverse(
-        prog: &'a Program,
+        prog: &'a Program<BytesInst>,
         cache: &ProgramCache,
         quit_after_match: bool,
         text: &[u8],
@@ -505,7 +498,7 @@ impl<'a> Fsm<'a> {
 
     #[cfg_attr(feature = "perf-inline", inline(always))]
     pub fn forward_many(
-        prog: &'a Program,
+        prog: &'a Program<BytesInst>,
         cache: &ProgramCache,
         matches: &mut [bool],
         text: &[u8],
@@ -539,7 +532,7 @@ impl<'a> Fsm<'a> {
                 debug_assert!(dfa.last_match_si != STATE_UNKNOWN);
                 debug_assert!(dfa.last_match_si != STATE_DEAD);
                 for ip in dfa.state(dfa.last_match_si).inst_ptrs() {
-                    if let Inst::Match(slot) = dfa.prog[ip] {
+                    if let BytesInst::Match(slot) = dfa.prog[ip] {
                         matches[slot] = true;
                     }
                 }
@@ -894,7 +887,7 @@ impl<'a> Fsm<'a> {
         mut si: StatePtr,
         b: Byte,
     ) -> Option<StatePtr> {
-        use prog::Inst::*;
+        use prog::BytesInst::*;
 
         // Initialize a queue with the current DFA state's NFA states.
         qcur.clear();
@@ -957,8 +950,6 @@ impl<'a> Fsm<'a> {
         qnext.clear();
         for &ip in &*qcur {
             match self.prog[ip as usize] {
-                // These states never happen in a byte-based program.
-                Char(_) | Ranges(_) => unreachable!(),
                 // These states are handled when following epsilon transitions.
                 Save(_) | Split(_) | EmptyLook(_) => {}
                 Match(_) => {
@@ -1056,8 +1047,8 @@ impl<'a> Fsm<'a> {
         q: &mut SparseSet,
         flags: EmptyFlags,
     ) {
+        use prog::BytesInst::*;
         use prog::EmptyLook::*;
-        use prog::Inst::*;
 
         // We need to traverse the NFA to follow epsilon transitions, so avoid
         // recursion with an explicit stack.
@@ -1072,7 +1063,6 @@ impl<'a> Fsm<'a> {
                 }
                 q.insert(ip as usize);
                 match self.prog[ip as usize] {
-                    Char(_) | Ranges(_) => unreachable!(),
                     Match(_) | Bytes(_) => {
                         break;
                     }
@@ -1190,7 +1180,7 @@ impl<'a> Fsm<'a> {
         q: &SparseSet,
         state_flags: &mut StateFlags,
     ) -> Option<State> {
-        use prog::Inst::*;
+        use prog::BytesInst::*;
 
         // We need to build up enough information to recognize pre-built states
         // in the DFA. Generally speaking, this includes every instruction
@@ -1211,7 +1201,6 @@ impl<'a> Fsm<'a> {
         for &ip in q {
             let ip = usize_to_u32(ip);
             match self.prog[ip as usize] {
-                Char(_) | Ranges(_) => unreachable!(),
                 Save(_) | Split(_) => {}
                 Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip),
                 EmptyLook(_) => {
diff --git a/src/exec.rs b/src/exec.rs
index 3d5a52bea..2348e357f 100644
--- a/src/exec.rs
+++ b/src/exec.rs
@@ -18,7 +18,7 @@ use input::{ByteInput, CharInput};
 use literal::LiteralSearcher;
 use pikevm;
 use pool::{Pool, PoolGuard};
-use prog::Program;
+use prog::{BytesInst, Program, UnicodeInst};
 use re_builder::RegexOptions;
 use re_bytes;
 use re_set;
@@ -61,6 +61,76 @@ pub struct ExecNoSync<'c> {
 #[derive(Debug)]
 pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>);
 
+#[derive(Debug)]
+enum NfaProgram {
+    Bytes(Program<BytesInst>),
+    Unicode(Program<UnicodeInst>),
+}
+
+impl NfaProgram {
+    #[inline]
+    fn len(&self) -> usize {
+        match *self {
+            NfaProgram::Bytes(ref prog) => prog.len(),
+            NfaProgram::Unicode(ref prog) => prog.len(),
+        }
+    }
+
+    #[inline]
+    fn is_empty(&self) -> bool {
+        match *self {
+            NfaProgram::Bytes(ref prog) => prog.is_empty(),
+            NfaProgram::Unicode(ref prog) => prog.is_empty(),
+        }
+    }
+
+    #[inline]
+    fn is_anchored_start(&self) -> bool {
+        match *self {
+            NfaProgram::Bytes(ref prog) => prog.is_anchored_start,
+            NfaProgram::Unicode(ref prog) => prog.is_anchored_start,
+        }
+    }
+
+    #[inline]
+    fn is_anchored_end(&self) -> bool {
+        match *self {
+            NfaProgram::Bytes(ref prog) => prog.is_anchored_end,
+            NfaProgram::Unicode(ref prog) => prog.is_anchored_end,
+        }
+    }
+
+    #[inline]
+    fn only_utf8(&self) -> bool {
+        match *self {
+            NfaProgram::Bytes(ref prog) => prog.only_utf8,
+            NfaProgram::Unicode(ref prog) => prog.only_utf8,
+        }
+    }
+
+    #[inline]
+    fn prefixes(&self) -> &LiteralSearcher {
+        match *self {
+            NfaProgram::Bytes(ref prog) => &prog.prefixes,
+            NfaProgram::Unicode(ref prog) => &prog.prefixes,
+        }
+    }
+
+    pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
+        match *self {
+            NfaProgram::Bytes(ref prog) => &prog.capture_name_idx,
+            NfaProgram::Unicode(ref prog) => &prog.capture_name_idx,
+        }
+    }
+
+    pub fn captures(&self) -> &[Option<String>] {
+        match *self {
+            NfaProgram::Bytes(ref prog) => &prog.captures,
+            NfaProgram::Unicode(ref prog) => &prog.captures,
+        }
+    }
+}
+
 /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such
 /// state is determined at compile time and never changes during search.
 #[derive(Debug)]
@@ -72,17 +142,17 @@ struct ExecReadOnly {
     ///
-    /// N.B. It is not possibly to make this byte-based from the public API.
+    /// N.B. It is not possible to make this byte-based from the public API.
     /// It is only used for testing byte based programs in the NFA simulations.
-    nfa: Program,
+    nfa: NfaProgram,
     /// A compiled byte based program for DFA execution. This is only used
     /// if a DFA can be executed. (Currently, only word boundary assertions are
     /// not supported.) Note that this program contains an embedded `.*?`
     /// preceding the first capture group, unless the regex is anchored at the
     /// beginning.
-    dfa: Program,
+    dfa: Program<BytesInst>,
     /// The same as above, except the program is reversed (and there is no
     /// preceding `.*?`). This is used by the DFA to find the starting location
     /// of matches.
-    dfa_reverse: Program,
+    dfa_reverse: Program<BytesInst>,
     /// A set of suffix literals extracted from the regex.
     ///
     /// Prefix literals are stored on the `Program`, since they are used inside
@@ -302,7 +372,7 @@ impl ExecBuilder {
         if self.options.pats.is_empty() {
             let ro = Arc::new(ExecReadOnly {
                 res: vec![],
-                nfa: Program::new(),
+                nfa: NfaProgram::Unicode(Program::new()),
                 dfa: Program::new(),
                 dfa_reverse: Program::new(),
                 suffixes: LiteralSearcher::empty(),
@@ -314,11 +384,6 @@ impl ExecBuilder {
             return Ok(Exec { ro: ro, pool });
         }
         let parsed = self.parse()?;
-        let mut nfa = Compiler::new()
-            .size_limit(self.options.size_limit)
-            .bytes(self.bytes || parsed.bytes)
-            .only_utf8(self.only_utf8)
-            .compile(&parsed.exprs)?;
         let mut dfa = Compiler::new()
             .size_limit(self.options.size_limit)
             .dfa(true)
@@ -333,8 +398,25 @@ impl ExecBuilder {
 
         #[cfg(feature = "perf-literal")]
         let ac = self.build_aho_corasick(&parsed);
-        nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
-        dfa.prefixes = nfa.prefixes.clone();
+
+        let nfa = if self.bytes || parsed.bytes {
+            let mut program = Compiler::new()
+                .size_limit(self.options.size_limit)
+                .only_utf8(self.only_utf8)
+                .compile(&parsed.exprs)?;
+            program.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
+            dfa.prefixes = program.prefixes.clone();
+            NfaProgram::Bytes(program)
+        } else {
+            let mut program = Compiler::new()
+                .size_limit(self.options.size_limit)
+                .only_utf8(self.only_utf8)
+                .compile(&parsed.exprs)?;
+            program.prefixes = LiteralSearcher::prefixes(parsed.prefixes);
+            dfa.prefixes = program.prefixes.clone();
+            NfaProgram::Unicode(program)
+        };
+
         dfa.dfa_size_limit = self.options.dfa_size_limit;
         dfa_reverse.dfa_size_limit = self.options.dfa_size_limit;
 
@@ -428,7 +510,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
     /// are two slots for every capture group, corresponding to possibly empty
     /// start and end locations of the capture.)
     fn slots_len(&self) -> usize {
-        self.ro.nfa.captures.len() * 2
+        self.ro.nfa.captures().len() * 2
     }
 
     fn next_after_empty(&self, _text: &[u8], i: usize) -> usize {
@@ -630,7 +712,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
             }
             #[cfg(feature = "perf-dfa")]
             MatchType::Dfa => {
-                if self.ro.nfa.is_anchored_start {
+                if self.ro.nfa.is_anchored_start() {
                     self.captures_nfa(slots, text, start)
                 } else {
                     match self.find_dfa_forward(text, start) {
@@ -701,12 +783,12 @@ impl<'c> ExecNoSync<'c> {
         use self::MatchLiteralType::*;
         match ty {
             Unanchored => {
-                let lits = &self.ro.nfa.prefixes;
+                let lits = self.ro.nfa.prefixes();
                 lits.find(&text[start..]).map(|(s, e)| (start + s, start + e))
             }
             AnchoredStart => {
-                let lits = &self.ro.nfa.prefixes;
-                if start == 0 || !self.ro.nfa.is_anchored_start {
+                let lits = self.ro.nfa.prefixes();
+                if start == 0 || !self.ro.nfa.is_anchored_start() {
                     lits.find_start(&text[start..])
                         .map(|(s, e)| (start + s, start + e))
                 } else {
@@ -1086,20 +1168,19 @@ impl<'c> ExecNoSync<'c> {
         start: usize,
         end: usize,
     ) -> bool {
-        if self.ro.nfa.uses_bytes() {
-            pikevm::Fsm::exec(
-                &self.ro.nfa,
+        match self.ro.nfa {
+            NfaProgram::Bytes(ref program) => pikevm::Fsm::exec(
+                program,
                 self.cache.value(),
                 matches,
                 slots,
                 quit_after_match,
-                ByteInput::new(text, self.ro.nfa.only_utf8),
+                ByteInput::new(text, program.only_utf8),
                 start,
                 end,
-            )
-        } else {
-            pikevm::Fsm::exec(
-                &self.ro.nfa,
+            ),
+            NfaProgram::Unicode(ref program) => pikevm::Fsm::exec(
+                program,
                 self.cache.value(),
                 matches,
                 slots,
@@ -1107,7 +1188,7 @@ impl<'c> ExecNoSync<'c> {
                 CharInput::new(text),
                 start,
                 end,
-            )
+            ),
         }
     }
 
@@ -1120,26 +1201,25 @@ impl<'c> ExecNoSync<'c> {
         start: usize,
         end: usize,
     ) -> bool {
-        if self.ro.nfa.uses_bytes() {
-            backtrack::Bounded::exec(
-                &self.ro.nfa,
+        match self.ro.nfa {
+            NfaProgram::Bytes(ref program) => backtrack::Bounded::exec(
+                program,
                 self.cache.value(),
                 matches,
                 slots,
-                ByteInput::new(text, self.ro.nfa.only_utf8),
+                ByteInput::new(text, program.only_utf8),
                 start,
                 end,
-            )
-        } else {
-            backtrack::Bounded::exec(
-                &self.ro.nfa,
+            ),
+            NfaProgram::Unicode(ref program) => backtrack::Bounded::exec(
+                program,
                 self.cache.value(),
                 matches,
                 slots,
                 CharInput::new(text),
                 start,
                 end,
-            )
+            ),
         }
     }
 
@@ -1237,7 +1317,7 @@ impl<'c> ExecNoSync<'c> {
         #[cfg(feature = "perf-literal")]
         fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool {
             // Only do this check if the haystack is big (>1MB).
-            if text.len() > (1 << 20) && ro.nfa.is_anchored_end {
+            if text.len() > (1 << 20) && ro.nfa.is_anchored_end() {
                 let lcs = ro.suffixes.lcs();
                 if lcs.len() >= 1 && !lcs.is_suffix(text) {
                     return false;
@@ -1250,7 +1330,7 @@ impl<'c> ExecNoSync<'c> {
     }
 
     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
-        &self.ro.nfa.capture_name_idx
+        self.ro.nfa.capture_name_idx()
     }
 }
 
@@ -1306,13 +1386,13 @@ impl Exec {
     ///
     /// Any capture that isn't named is None.
     pub fn capture_names(&self) -> &[Option<String>] {
-        &self.ro.nfa.captures
+        self.ro.nfa.captures()
     }
 
     /// Return a reference to named groups mapping (from group name to
     /// group position).
     pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> {
-        &self.ro.nfa.capture_name_idx
+        self.ro.nfa.capture_name_idx()
     }
 }
 
@@ -1329,7 +1409,7 @@ impl ExecReadOnly {
             return hint.unwrap();
         }
         // If the NFA is empty, then we'll never match anything.
-        if self.nfa.insts.is_empty() {
+        if self.nfa.is_empty() {
             return MatchType::Nothing;
         }
         if let Some(literalty) = self.choose_literal_match_type() {
@@ -1371,15 +1451,15 @@ impl ExecReadOnly {
                     MatchLiteralType::AhoCorasick,
                 ));
             }
-            if ro.nfa.prefixes.complete() {
-                return if ro.nfa.is_anchored_start {
+            if ro.nfa.prefixes().complete() {
+                return if ro.nfa.is_anchored_start() {
                     Some(MatchType::Literal(MatchLiteralType::AnchoredStart))
                 } else {
                     Some(MatchType::Literal(MatchLiteralType::Unanchored))
                 };
             }
             if ro.suffixes.complete() {
-                return if ro.nfa.is_anchored_end {
+                return if ro.nfa.is_anchored_end() {
                     Some(MatchType::Literal(MatchLiteralType::AnchoredEnd))
                 } else {
                     // This case shouldn't happen. When the regex isn't
@@ -1412,7 +1492,7 @@ impl ExecReadOnly {
             }
             // If the regex is anchored at the end but not the start, then
             // just match in reverse from the end of the haystack.
-            if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end {
+            if !ro.nfa.is_anchored_start() && ro.nfa.is_anchored_end() {
                 return Some(MatchType::DfaAnchoredReverse);
             }
             #[cfg(feature = "perf-literal")]
@@ -1536,8 +1616,20 @@ pub struct ProgramCacheInner {
 impl ProgramCacheInner {
     fn new(ro: &ExecReadOnly) -> Self {
         ProgramCacheInner {
-            pikevm: pikevm::Cache::new(&ro.nfa),
-            backtrack: backtrack::Cache::new(&ro.nfa),
+            pikevm: match ro.nfa {
+                NfaProgram::Bytes(ref program) => pikevm::Cache::new(program),
+                NfaProgram::Unicode(ref program) => {
+                    pikevm::Cache::new(program)
+                }
+            },
+            backtrack: match ro.nfa {
+                NfaProgram::Bytes(ref program) => {
+                    backtrack::Cache::new(program)
+                }
+                NfaProgram::Unicode(ref program) => {
+                    backtrack::Cache::new(program)
+                }
+            },
             #[cfg(feature = "perf-dfa")]
             dfa: dfa::Cache::new(&ro.dfa),
             #[cfg(feature = "perf-dfa")]
diff --git a/src/lib.rs b/src/lib.rs
index 357ac0dd0..15a63f204 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -781,5 +781,5 @@ pub mod internal {
     pub use exec::{Exec, ExecBuilder};
     pub use input::{Char, CharInput, Input, InputAt};
     pub use literal::LiteralSearcher;
-    pub use prog::{EmptyLook, Inst, InstRanges, Program};
+    pub use prog::{BytesInst, EmptyLook, InstRanges, Program, UnicodeInst};
 }
diff --git a/src/pikevm.rs b/src/pikevm.rs
index 299087da8..f6cd88be7 100644
--- a/src/pikevm.rs
+++ b/src/pikevm.rs
@@ -19,17 +19,17 @@ use std::mem;
 
 use exec::ProgramCache;
 use input::{Input, InputAt};
-use prog::{InstPtr, Program};
+use prog::{BytesInst, InstPtr, InstTrait, Program, UnicodeInst};
 use re_trait::Slot;
 use sparse::SparseSet;
 
 /// An NFA simulation matching engine.
 #[derive(Debug)]
-pub struct Fsm<'r, I> {
+pub struct Fsm<'r, I, P: InstTrait> {
     /// The sequence of opcodes (among other things) that is actually executed.
     ///
     /// The program may be byte oriented or Unicode codepoint oriented.
-    prog: &'r Program,
+    prog: &'r Program<P>,
     /// An explicit stack used for following epsilon transitions. (This is
     /// borrowed from the cache.)
     stack: &'r mut Vec<FollowEpsilon>,
@@ -49,7 +49,7 @@ pub struct Cache {
 
 /// An ordered set of NFA states and their captures.
 #[derive(Clone, Debug)]
-struct Threads {
+pub struct Threads {
     /// An ordered set of opcodes (each opcode is an NFA state).
     set: SparseSet,
     /// Captures for every NFA state.
@@ -75,18 +75,18 @@ enum FollowEpsilon {
 impl Cache {
     /// Create a new allocation used by the NFA machine to record execution
     /// and captures.
-    pub fn new(_prog: &Program) -> Self {
+    pub fn new<I: InstTrait>(_prog: &Program<I>) -> Self {
         Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] }
     }
 }
 
-impl<'r, I: Input> Fsm<'r, I> {
+impl<'r, I: Input, P: InstTrait + Step> Fsm<'r, I, P> {
     /// Execute the NFA matching engine.
     ///
     /// If there's a match, `exec` returns `true` and populates the given
     /// captures accordingly.
     pub fn exec(
-        prog: &'r Program,
+        prog: &'r Program<P>,
         cache: &ProgramCache,
         matches: &mut [bool],
         slots: &mut [Slot],
@@ -231,39 +231,15 @@ impl<'r, I: Input> Fsm<'r, I> {
         at: InputAt,
         at_next: InputAt,
     ) -> bool {
-        use prog::Inst::*;
-        match self.prog[ip] {
-            Match(match_slot) => {
-                if match_slot < matches.len() {
-                    matches[match_slot] = true;
-                }
-                for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
-                    *slot = *val;
-                }
-                true
-            }
-            Char(ref inst) => {
-                if inst.c == at.char() {
-                    self.add(nlist, thread_caps, inst.goto, at_next);
-                }
-                false
-            }
-            Ranges(ref inst) => {
-                if inst.matches(at.char()) {
-                    self.add(nlist, thread_caps, inst.goto, at_next);
-                }
-                false
-            }
-            Bytes(ref inst) => {
-                if let Some(b) = at.byte() {
-                    if inst.matches(b) {
-                        self.add(nlist, thread_caps, inst.goto, at_next);
-                    }
-                }
-                false
-            }
-            EmptyLook(_) | Save(_) | Split(_) => false,
-        }
+        self.prog[ip].step(
+            self,
+            nlist,
+            matches,
+            slots,
+            thread_caps,
+            at,
+            at_next,
+        )
     }
 
     /// Follows epsilon transitions and adds them for processing to nlist,
@@ -300,40 +276,196 @@ impl<'r, I: Input> Fsm<'r, I> {
         // traverse the set of states. We only push to the stack when we
         // absolutely need recursion (restoring captures or following a
         // branch).
-        use prog::Inst::*;
         loop {
             // Don't visit states we've already added.
             if nlist.set.contains(ip) {
                 return;
             }
             nlist.set.insert(ip);
-            match self.prog[ip] {
-                EmptyLook(ref inst) => {
-                    if self.input.is_empty_match(at, inst) {
-                        ip = inst.goto;
-                    }
+            if let Some(next_ip) =
+                self.prog[ip].add_step(self, nlist, thread_caps, ip, at)
+            {
+                ip = next_ip;
+            } else {
+                return;
+            }
+        }
+    }
+}
+
+pub trait Step: InstTrait + Sized {
+    fn step<'r, I: Input>(
+        &self,
+        fsm: &mut Fsm<'r, I, Self>,
+        nlist: &mut Threads,
+        matches: &mut [bool],
+        slots: &mut [Slot],
+        thread_caps: &mut [Option<usize>],
+        at: InputAt,
+        at_next: InputAt,
+    ) -> bool;
+
+    fn add_step<'r, I: Input>(
+        &self,
+        fsm: &mut Fsm<'r, I, Self>,
+        nlist: &mut Threads,
+        thread_caps: &mut [Option<usize>],
+        ip: usize,
+        at: InputAt,
+    ) -> Option<usize>;
+}
+
+impl Step for UnicodeInst {
+    fn step<'r, I: Input>(
+        &self,
+        fsm: &mut Fsm<'r, I, UnicodeInst>,
+        nlist: &mut Threads,
+        matches: &mut [bool],
+        slots: &mut [Slot],
+        thread_caps: &mut [Option<usize>],
+        at: InputAt,
+        at_next: InputAt,
+    ) -> bool {
+        use prog::UnicodeInst::*;
+        match *self {
+            Match(match_slot) => {
+                if match_slot < matches.len() {
+                    matches[match_slot] = true;
                 }
-                Save(ref inst) => {
-                    if inst.slot < thread_caps.len() {
-                        self.stack.push(FollowEpsilon::Capture {
-                            slot: inst.slot,
-                            pos: thread_caps[inst.slot],
-                        });
-                        thread_caps[inst.slot] = Some(at.pos());
-                    }
-                    ip = inst.goto;
+                for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
+                    *slot = *val;
                 }
-                Split(ref inst) => {
-                    self.stack.push(FollowEpsilon::IP(inst.goto2));
-                    ip = inst.goto1;
+                true
+            }
+            Char(ref inst) => {
+                if inst.c == at.char() {
+                    fsm.add(nlist, thread_caps, inst.goto, at_next);
+                }
+                false
+            }
+            Ranges(ref inst) => {
+                if inst.matches(at.char()) {
+                    fsm.add(nlist, thread_caps, inst.goto, at_next);
+                }
+                false
+            }
+            EmptyLook(_) | Save(_) | Split(_) => false,
+        }
+    }
+
+    fn add_step<'r, I: Input>(
+        &self,
+        fsm: &mut Fsm<'r, I, UnicodeInst>,
+        nlist: &mut Threads,
+        thread_caps: &mut [Option<usize>],
+        ip: usize,
+        at: InputAt,
+    ) -> Option<usize> {
+        use prog::UnicodeInst::*;
+        match *self {
+            EmptyLook(ref inst) => {
+                if fsm.input.is_empty_match(at, inst) {
+                    Some(inst.goto)
+                } else {
+                    None // failed assertion: this thread stops here
+                }
+            }
+            Save(ref inst) => {
+                if inst.slot < thread_caps.len() {
+                    fsm.stack.push(FollowEpsilon::Capture {
+                        slot: inst.slot,
+                        pos: thread_caps[inst.slot],
+                    });
+                    thread_caps[inst.slot] = Some(at.pos());
+                }
+                Some(inst.goto)
+            }
+            Split(ref inst) => {
+                fsm.stack.push(FollowEpsilon::IP(inst.goto2));
+                Some(inst.goto1)
+            }
+            Match(_) | Char(_) | Ranges(_) => {
+                let t = &mut nlist.caps(ip);
+                for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
+                    *slot = *val;
+                }
+                None
+            }
+        }
+    }
+}
+
+impl Step for BytesInst {
+    fn step<'r, I: Input>(
+        &self,
+        fsm: &mut Fsm<'r, I, BytesInst>,
+        nlist: &mut Threads,
+        matches: &mut [bool],
+        slots: &mut [Slot],
+        thread_caps: &mut [Option<usize>],
+        at: InputAt,
+        at_next: InputAt,
+    ) -> bool {
+        use prog::BytesInst::*;
+        match *self {
+            Match(match_slot) => {
+                if match_slot < matches.len() {
+                    matches[match_slot] = true;
                 }
-                Match(_) | Char(_) | Ranges(_) | Bytes(_) => {
-                    let t = &mut nlist.caps(ip);
-                    for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
-                        *slot = *val;
+                for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) {
+                    *slot = *val;
+                }
+                true
+            }
+            Bytes(ref inst) => {
+                if let Some(b) = at.byte() {
+                    if inst.matches(b) {
+                        fsm.add(nlist, thread_caps, inst.goto, at_next);
                     }
-                    return;
                 }
+                false
+            }
+            EmptyLook(_) | Save(_) | Split(_) => false,
+        }
+    }
+
+    fn add_step<'r, I: Input>(
+        &self,
+        fsm: &mut Fsm<'r, I, BytesInst>,
+        nlist: &mut Threads,
+        thread_caps: &mut [Option<usize>],
+        ip: usize,
+        at: InputAt,
+    ) -> Option<usize> {
+        use prog::BytesInst::*;
+        match *self {
+            EmptyLook(ref inst) => {
+                if fsm.input.is_empty_match(at, inst) {
+                    Some(inst.goto)
+                } else {
+                    None // failed assertion: this thread stops here
+                }
+            }
+            Save(ref inst) => {
+                if inst.slot < thread_caps.len() {
+                    fsm.stack.push(FollowEpsilon::Capture {
+                        slot: inst.slot,
+                        pos: thread_caps[inst.slot],
+                    });
+                    thread_caps[inst.slot] = Some(at.pos());
+                }
+                Some(inst.goto)
+            }
+            Split(ref inst) => {
+                fsm.stack.push(FollowEpsilon::IP(inst.goto2));
+                Some(inst.goto1)
+            }
+            Match(_) | Bytes(_) => {
+                let t = &mut nlist.caps(ip);
+                for (slot, val) in t.iter_mut().zip(thread_caps.iter()) {
+                    *slot = *val;
+                }
+                None
             }
         }
     }
diff --git a/src/prog.rs b/src/prog.rs
index 74e5f2f6f..640b71b38 100644
--- a/src/prog.rs
+++ b/src/prog.rs
@@ -15,9 +15,9 @@ pub type InstPtr = usize;
 /// Program is a sequence of instructions and various facts about thos
 /// instructions.
 #[derive(Clone)]
-pub struct Program {
+pub struct Program<I: InstTrait> {
     /// A sequence of instructions that represents an NFA.
-    pub insts: Vec<Inst>,
+    pub insts: Vec<I>,
     /// Pointers to each Match instruction in the sequence.
     ///
     /// This is always length 1 unless this program represents a regex set.
@@ -38,9 +38,6 @@ pub struct Program {
     pub byte_classes: Vec<u8>,
     /// When true, this program can only match valid UTF-8.
     pub only_utf8: bool,
-    /// When true, this program uses byte range instructions instead of Unicode
-    /// range instructions.
-    pub is_bytes: bool,
     /// When true, the program is compiled for DFA matching. For example, this
     /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored
     /// regexes.
@@ -74,7 +71,7 @@ pub struct Program {
     pub dfa_size_limit: usize,
 }
 
-impl Program {
+impl<I: InstTrait> Program<I> {
     /// Creates an empty instruction sequence. Fields are given default
     /// values.
     pub fn new() -> Self {
@@ -86,7 +83,6 @@ impl Program {
             start: 0,
             byte_classes: vec![0; 256],
             only_utf8: true,
-            is_bytes: false,
             is_dfa: false,
             is_reverse: false,
             is_anchored_start: false,
@@ -101,9 +97,9 @@ impl Program {
     /// next pc that is not a no-op instruction.
     pub fn skip(&self, mut pc: usize) -> usize {
         loop {
-            match self[pc] {
-                Inst::Save(ref i) => pc = i.goto,
-                _ => return pc,
+            match self[pc].save_goto() {
+                Some(goto) => pc = goto,
+                None => return pc,
             }
         }
     }
@@ -117,10 +113,7 @@ impl Program {
             // meaningless.
             return false;
         }
-        match self[self.skip(pc)] {
-            Inst::Match(_) => true,
-            _ => false,
-        }
+        self[self.skip(pc)].is_match()
     }
 
     /// Returns true if the current configuration demands that an implicit
@@ -132,7 +125,7 @@ impl Program {
     /// Returns true if this program uses Byte instructions instead of
     /// Char/Range instructions.
     pub fn uses_bytes(&self) -> bool {
-        self.is_bytes || self.is_dfa
+        I::IS_BYTES || self.is_dfa
     }
 
     /// Returns true if this program exclusively matches valid UTF-8 bytes.
@@ -148,7 +141,7 @@ impl Program {
         // The only instruction that uses heap space is Ranges (for
         // Unicode codepoint programs) to store non-overlapping codepoint
         // ranges. To keep this operation constant time, we ignore them.
-        (self.len() * mem::size_of::<Inst>())
+        (self.len() * mem::size_of::<I>())
             + (self.matches.len() * mem::size_of::<InstPtr>())
             + (self.captures.len() * mem::size_of::<Option<String>>())
             + (self.capture_name_idx.len()
@@ -158,8 +151,8 @@ impl Program {
     }
 }
 
-impl Deref for Program {
-    type Target = [Inst];
+impl<I: InstTrait> Deref for Program<I> {
+    type Target = [I];
 
     #[cfg_attr(feature = "perf-inline", inline(always))]
     fn deref(&self) -> &Self::Target {
@@ -167,67 +160,13 @@ impl Deref for Program {
     }
 }
 
-impl fmt::Debug for Program {
+impl<I: InstTrait> fmt::Debug for Program<I> {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        use self::Inst::*;
-
-        fn with_goto(cur: usize, goto: usize, fmtd: String) -> String {
-            if goto == cur + 1 {
-                fmtd
-            } else {
-                format!("{} (goto: {})", fmtd, goto)
-            }
-        }
-
-        fn visible_byte(b: u8) -> String {
-            use std::ascii::escape_default;
-            let escaped = escape_default(b).collect::<Vec<u8>>();
-            String::from_utf8_lossy(&escaped).into_owned()
-        }
-
         for (pc, inst) in self.iter().enumerate() {
-            match *inst {
-                Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?,
-                Save(ref inst) => {
-                    let s = format!("{:04} Save({})", pc, inst.slot);
-                    write!(f, "{}", with_goto(pc, inst.goto, s))?;
-                }
-                Split(ref inst) => {
-                    write!(
-                        f,
-                        "{:04} Split({}, {})",
-                        pc, inst.goto1, inst.goto2
-                    )?;
-                }
-                EmptyLook(ref inst) => {
-                    let s = format!("{:?}", inst.look);
-                    write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
-                }
-                Char(ref inst) => {
-                    let s = format!("{:?}", inst.c);
-                    write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
-                }
-                Ranges(ref inst) => {
-                    let ranges = inst
-                        .ranges
-                        .iter()
-                        .map(|r| format!("{:?}-{:?}", r.0, r.1))
-                        .collect::<Vec<String>>()
-                        .join(", ");
-                    write!(
-                        f,
-                        "{:04} {}",
-                        pc,
-                        with_goto(pc, inst.goto, ranges)
-                    )?;
-                }
-                Bytes(ref inst) => {
-                    let s = format!(
-                        "Bytes({}, {})",
-                        visible_byte(inst.start),
-                        visible_byte(inst.end)
-                    );
-                    write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?;
+            write!(f, "{:04} {:?}", pc, inst)?;
+            if let Some(goto) = inst.goto() {
+                if pc + 1 != goto {
+                    write!(f, " (goto: {})", goto)?; // non-fallthrough only
                 }
             }
             if pc == self.start {
@@ -239,33 +178,34 @@ impl fmt::Debug for Program {
     }
 }
 
-impl<'a> IntoIterator for &'a Program {
-    type Item = &'a Inst;
-    type IntoIter = slice::Iter<'a, Inst>;
+impl<'a, I: InstTrait> IntoIterator for &'a Program<I> {
+    type Item = &'a I;
+    type IntoIter = slice::Iter<'a, I>;
     fn into_iter(self) -> Self::IntoIter {
         self.iter()
     }
 }
 
-/// Inst is an instruction code in a Regex program.
+/// `InstTrait` represents an instruction code in a Regex program.
 ///
 /// Regrettably, a regex program either contains Unicode codepoint
-/// instructions (Char and Ranges) or it contains byte instructions (Bytes).
+/// instructions (Char and Ranges: [`UnicodeInst`]) or it contains
+/// byte instructions (Bytes: [`BytesInst`]).
 /// A regex program can never contain both.
-///
-/// It would be worth investigating splitting this into two distinct types and
-/// then figuring out how to make the matching engines polymorphic over those
-/// types without sacrificing performance.
-///
-/// Other than the benefit of moving invariants into the type system, another
-/// benefit is the decreased size. If we remove the `Char` and `Ranges`
-/// instructions from the `Inst` enum, then its size shrinks from 40 bytes to
-/// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges`
-/// variant.) Given that byte based machines are typically much bigger than
-/// their Unicode analogues (because they can decode UTF-8 directly), this ends
-/// up being a pretty significant savings.
-#[derive(Clone, Debug)]
-pub enum Inst {
+pub trait InstTrait: fmt::Debug {
+    const IS_BYTES: bool;
+
+    /// Returns true if and only if this is a match instruction.
+    fn is_match(&self) -> bool;
+    fn goto(&self) -> Option<usize>;
+    fn save_goto(&self) -> Option<usize>;
+    fn new_match(i: usize) -> Self;
+    fn new_split(split: InstSplit) -> Self;
+}
+
+/// A Unicode codepoint instruction.
+#[derive(Clone)]
+pub enum UnicodeInst {
     /// Match indicates that the program has reached a match state.
     ///
     /// The number in the match corresponds to the Nth logical regular
@@ -289,20 +229,173 @@ pub enum Inst {
     /// Ranges requires the regex program to match the character at the current
     /// position in the input with one of the ranges specified in InstRanges.
     Ranges(InstRanges),
+}
+
+impl InstTrait for UnicodeInst {
+    const IS_BYTES: bool = false;
+
+    #[inline]
+    fn is_match(&self) -> bool {
+        match *self {
+            Self::Match(_) => true,
+            _ => false,
+        }
+    }
+
+    #[inline]
+    fn goto(&self) -> Option<usize> {
+        match self {
+            Self::Match(_) => None,
+            Self::Save(ref inst) => Some(inst.goto),
+            Self::Split(_) => None,
+            Self::EmptyLook(ref inst) => Some(inst.goto),
+            Self::Char(ref inst) => Some(inst.goto),
+            Self::Ranges(ref inst) => Some(inst.goto),
+        }
+    }
+
+    #[inline]
+    fn save_goto(&self) -> Option<usize> {
+        match self {
+            Self::Save(ref inst) => Some(inst.goto),
+            _ => None,
+        }
+    }
+
+    #[inline]
+    fn new_match(i: usize) -> Self {
+        Self::Match(i)
+    }
+
+    #[inline]
+    fn new_split(split: InstSplit) -> Self {
+        Self::Split(split)
+    }
+}
+
+impl fmt::Debug for UnicodeInst {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            Self::Match(slot) => write!(f, "Match({:?})", slot),
+            Self::Save(ref inst) => write!(f, "Save({})", inst.slot),
+            Self::Split(ref inst) => {
+                write!(f, "Split({}, {})", inst.goto1, inst.goto2)
+            }
+            Self::EmptyLook(ref inst) => {
+                write!(f, "{:?}", inst.look)
+            }
+            Self::Char(ref inst) => {
+                write!(f, "{:?}", inst.c)
+            }
+            Self::Ranges(ref inst) => {
+                write!(
+                    f,
+                    "{}",
+                    inst.ranges
+                        .iter()
+                        .map(|r| format!("{:?}-{:?}", r.0, r.1))
+                        .collect::<Vec<String>>()
+                        .join(", ")
+                )
+            }
+        }
+    }
+}
+
+/// A byte instruction.
+#[derive(Clone)]
+pub enum BytesInst {
+    /// Match indicates that the program has reached a match state.
+    ///
+    /// The number in the match corresponds to the Nth logical regular
+    /// expression in this program. This index is always 0 for normal regex
+    /// programs. Values greater than 0 appear when compiling regex sets, and
+    /// each match instruction gets its own unique value. The value corresponds
+    /// to the Nth regex in the set.
+    Match(usize),
+    /// Save causes the program to save the current location of the input in
+    /// the slot indicated by InstSave.
+    Save(InstSave),
+    /// Split causes the program to diverge to one of two paths in the
+    /// program, preferring goto1 in InstSplit.
+    Split(InstSplit),
+    /// EmptyLook represents a zero-width assertion in a regex program. A
+    /// zero-width assertion does not consume any of the input text.
+    EmptyLook(InstEmptyLook),
     /// Bytes is like Ranges, except it expresses a single byte range. It is
     /// used in conjunction with Split instructions to implement multi-byte
     /// character classes.
     Bytes(InstBytes),
 }
 
-impl Inst {
-    /// Returns true if and only if this is a match instruction.
-    pub fn is_match(&self) -> bool {
+impl InstTrait for BytesInst {
+    const IS_BYTES: bool = true;
+
+    #[inline]
+    fn is_match(&self) -> bool {
         match *self {
-            Inst::Match(_) => true,
+            Self::Match(_) => true,
             _ => false,
         }
     }
+
+    #[inline]
+    fn goto(&self) -> Option<usize> {
+        match self {
+            Self::Match(_) => None,
+            Self::Save(ref inst) => Some(inst.goto),
+            Self::Split(_) => None,
+            Self::EmptyLook(ref inst) => Some(inst.goto),
+            Self::Bytes(ref inst) => Some(inst.goto),
+        }
+    }
+
+    #[inline]
+    fn save_goto(&self) -> Option<usize> {
+        match self {
+            Self::Save(ref inst) => Some(inst.goto),
+            _ => None,
+        }
+    }
+
+    #[inline]
+    fn new_match(i: usize) -> Self {
+        Self::Match(i)
+    }
+
+    #[inline]
+    fn new_split(split: InstSplit) -> Self {
+        Self::Split(split)
+    }
+}
+
+impl fmt::Debug for BytesInst {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        fn visible_byte(b: u8) -> String {
+            use std::ascii::escape_default;
+            let escaped = escape_default(b).collect::<Vec<u8>>();
+            String::from_utf8_lossy(&escaped).into_owned()
+        }
+
+        match self {
+            Self::Match(slot) => write!(f, "Match({:?})", slot),
+            Self::Save(ref inst) => write!(f, "Save({})", inst.slot),
+            Self::Split(ref inst) => {
+                write!(f, "Split({}, {})", inst.goto1, inst.goto2)
+            }
+            Self::EmptyLook(ref inst) => {
+                write!(f, "{:?}", inst.look)
+            }
+            Self::Bytes(ref inst) => {
+                write!(
+                    f,
+                    "Bytes({}, {})",
+                    visible_byte(inst.start),
+                    visible_byte(inst.end)
+                )
+            }
+        }
+    }
 }
 
 /// Representation of the Save instruction.
@@ -432,3 +525,17 @@ impl InstBytes {
         self.start <= byte && byte <= self.end
     }
 }
+
+#[cfg(test)]
+mod test {
+    #[test]
+    #[cfg(target_pointer_width = "64")]
+    fn test_size_of_inst() {
+        use std::mem::size_of;
+
+        use super::{BytesInst, UnicodeInst};
+
+        assert_eq!(24, size_of::<BytesInst>());
+        assert_eq!(40, size_of::<UnicodeInst>());
+    }
+}