diff --git a/src/backtrack.rs b/src/backtrack.rs index 6100c1730..c1dff8499 100644 --- a/src/backtrack.rs +++ b/src/backtrack.rs @@ -18,7 +18,7 @@ use exec::ProgramCache; use input::{Input, InputAt}; -use prog::{InstPtr, Program}; +use prog::{BytesInst, InstPtr, InstTrait, Program, UnicodeInst}; use re_trait::Slot; type Bits = u32; @@ -41,8 +41,8 @@ pub fn should_exec(num_insts: usize, text_len: usize) -> bool { /// A backtracking matching engine. #[derive(Debug)] -pub struct Bounded<'a, 'm, 'r, 's, I> { - prog: &'r Program, +pub struct Bounded<'a, 'm, 'r, 's, I, P: InstTrait> { + prog: &'r Program

, input: I, matches: &'m mut [bool], slots: &'s mut [Slot], @@ -59,7 +59,7 @@ pub struct Cache { impl Cache { /// Create new empty cache for the backtracking engine. - pub fn new(_prog: &Program) -> Self { + pub fn new(_prog: &Program) -> Self { Cache { jobs: vec![], visited: vec![] } } } @@ -76,13 +76,15 @@ enum Job { SaveRestore { slot: usize, old_pos: Option }, } -impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { +impl<'a, 'm, 'r, 's, I: Input, P: InstTrait + Step> + Bounded<'a, 'm, 'r, 's, I, P> +{ /// Execute the backtracking matching engine. /// /// If there's a match, `exec` returns `true` and populates the given /// captures accordingly. pub fn exec( - prog: &'r Program, + prog: &'r Program

, cache: &ProgramCache, matches: &'m mut [bool], slots: &'s mut [Slot], @@ -93,14 +95,14 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { let mut cache = cache.borrow_mut(); let cache = &mut cache.backtrack; let start = input.at(start); - let mut b = Bounded { + Bounded { prog: prog, input: input, matches: matches, slots: slots, m: cache, - }; - b.exec_(start, end) + } + .exec_(start, end) } /// Clears the cache such that the backtracking engine can be executed @@ -196,7 +198,6 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { } fn step(&mut self, mut ip: InstPtr, mut at: InputAt) -> bool { - use prog::Inst::*; loop { // This loop is an optimization to avoid constantly pushing/popping // from the stack. Namely, if we're pushing a job only to run it @@ -205,64 +206,12 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { if self.has_visited(ip, at) { return false; } - match self.prog[ip] { - Match(slot) => { - if slot < self.matches.len() { - self.matches[slot] = true; - } - return true; - } - Save(ref inst) => { - if let Some(&old_pos) = self.slots.get(inst.slot) { - // If this path doesn't work out, then we save the old - // capture index (if one exists) in an alternate - // job. If the next path fails, then the alternate - // job is popped and the old capture index is restored. 
- self.m.jobs.push(Job::SaveRestore { - slot: inst.slot, - old_pos: old_pos, - }); - self.slots[inst.slot] = Some(at.pos()); - } - ip = inst.goto; - } - Split(ref inst) => { - self.m.jobs.push(Job::Inst { ip: inst.goto2, at: at }); - ip = inst.goto1; - } - EmptyLook(ref inst) => { - if self.input.is_empty_match(at, inst) { - ip = inst.goto; - } else { - return false; - } - } - Char(ref inst) => { - if inst.c == at.char() { - ip = inst.goto; - at = self.input.at(at.next_pos()); - } else { - return false; - } - } - Ranges(ref inst) => { - if inst.matches(at.char()) { - ip = inst.goto; - at = self.input.at(at.next_pos()); - } else { - return false; - } - } - Bytes(ref inst) => { - if let Some(b) = at.byte() { - if inst.matches(b) { - ip = inst.goto; - at = self.input.at(at.next_pos()); - continue; - } - } - return false; + match self.prog[ip].step(self, at) { + Ok((next_ip, next_at)) => { + ip = next_ip; + at = next_at; } + Err(res) => return res, } } } @@ -280,6 +229,125 @@ impl<'a, 'm, 'r, 's, I: Input> Bounded<'a, 'm, 'r, 's, I> { } } +pub trait Step: InstTrait + Sized { + fn step( + &self, + bounded: &mut Bounded<'_, '_, '_, '_, I, Self>, + at: InputAt, + ) -> Result<(InstPtr, InputAt), bool>; +} + +impl Step for UnicodeInst { + fn step( + &self, + bounded: &mut Bounded<'_, '_, '_, '_, I, Self>, + mut at: InputAt, + ) -> Result<(InstPtr, InputAt), bool> { + use prog::UnicodeInst::*; + match *self { + Match(slot) => { + if slot < bounded.matches.len() { + bounded.matches[slot] = true; + } + Err(true) + } + Save(ref inst) => { + if let Some(&old_pos) = bounded.slots.get(inst.slot) { + // If this path doesn't work out, then we save the old + // capture index (if one exists) in an alternate + // job. If the next path fails, then the alternate + // job is popped and the old capture index is restored. 
+ bounded.m.jobs.push(Job::SaveRestore { + slot: inst.slot, + old_pos: old_pos, + }); + bounded.slots[inst.slot] = Some(at.pos()); + } + Ok((inst.goto, at)) + } + Split(ref inst) => { + bounded.m.jobs.push(Job::Inst { ip: inst.goto2, at: at }); + Ok((inst.goto1, at)) + } + EmptyLook(ref inst) => { + if bounded.input.is_empty_match(at, inst) { + Ok((inst.goto, at)) + } else { + Err(false) + } + } + Char(ref inst) => { + if inst.c == at.char() { + at = bounded.input.at(at.next_pos()); + Ok((inst.goto, at)) + } else { + Err(false) + } + } + Ranges(ref inst) => { + if inst.matches(at.char()) { + at = bounded.input.at(at.next_pos()); + Ok((inst.goto, at)) + } else { + Err(false) + } + } + } + } +} + +impl Step for BytesInst { + fn step( + &self, + bounded: &mut Bounded<'_, '_, '_, '_, I, Self>, + mut at: InputAt, + ) -> Result<(InstPtr, InputAt), bool> { + use prog::BytesInst::*; + match *self { + Match(slot) => { + if slot < bounded.matches.len() { + bounded.matches[slot] = true; + } + Err(true) + } + Save(ref inst) => { + if let Some(&old_pos) = bounded.slots.get(inst.slot) { + // If this path doesn't work out, then we save the old + // capture index (if one exists) in an alternate + // job. If the next path fails, then the alternate + // job is popped and the old capture index is restored. 
+ bounded.m.jobs.push(Job::SaveRestore { + slot: inst.slot, + old_pos: old_pos, + }); + bounded.slots[inst.slot] = Some(at.pos()); + } + Ok((inst.goto, at)) + } + Split(ref inst) => { + bounded.m.jobs.push(Job::Inst { ip: inst.goto2, at: at }); + Ok((inst.goto1, at)) + } + EmptyLook(ref inst) => { + if bounded.input.is_empty_match(at, inst) { + Ok((inst.goto, at)) + } else { + Err(false) + } + } + Bytes(ref inst) => { + if let Some(b) = at.byte() { + if inst.matches(b) { + at = bounded.input.at(at.next_pos()); + return Ok((inst.goto, at)); + } + } + Err(false) + } + } + } +} + fn usize_to_u32(n: usize) -> u32 { if (n as u64) > (::std::u32::MAX as u64) { panic!("BUG: {} is too big to fit into u32", n) diff --git a/src/compile.rs b/src/compile.rs index 9ffd34704..08d9b6849 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::fmt; use std::iter; +use std::mem; use std::result; use std::sync::Arc; @@ -9,8 +10,8 @@ use syntax::is_word_byte; use syntax::utf8::{Utf8Range, Utf8Sequence, Utf8Sequences}; use prog::{ - EmptyLook, Inst, InstBytes, InstChar, InstEmptyLook, InstPtr, InstRanges, - InstSave, InstSplit, Program, + BytesInst, EmptyLook, InstBytes, InstChar, InstEmptyLook, InstPtr, + InstRanges, InstSave, InstSplit, InstTrait, Program, UnicodeInst, }; use Error; @@ -29,9 +30,9 @@ struct Patch { // `Compiler` is only public via the `internal` module, so avoid deriving // `Debug`. #[allow(missing_debug_implementations)] -pub struct Compiler { - insts: Vec, - compiled: Program, +pub struct Compiler { + insts: Vec>, + compiled: Program, capture_name_idx: HashMap, num_exprs: usize, size_limit: usize, @@ -40,7 +41,7 @@ pub struct Compiler { byte_classes: ByteClassSet, } -impl Compiler { +impl Compiler { /// Create a new regular expression compiler. /// /// Various options can be set before calling `compile` on an expression. 
@@ -65,22 +66,6 @@ impl Compiler { self } - /// If bytes is true, then the program is compiled as a byte based - /// automaton, which incorporates UTF-8 decoding into the machine. If it's - /// false, then the automaton is Unicode scalar value based, e.g., an - /// engine utilizing such an automaton is responsible for UTF-8 decoding. - /// - /// The specific invariant is that when returning a byte based machine, - /// the neither the `Char` nor `Ranges` instructions are produced. - /// Conversely, when producing a Unicode scalar value machine, the `Bytes` - /// instruction is never produced. - /// - /// Note that `dfa(true)` implies `bytes(true)`. - pub fn bytes(mut self, yes: bool) -> Self { - self.compiled.is_bytes = yes; - self - } - /// When disabled, the program compiled may match arbitrary bytes. /// /// When enabled (the default), all compiled programs exclusively match @@ -108,13 +93,18 @@ impl Compiler { self.compiled.is_reverse = yes; self } +} +impl> Compiler { /// Compile a regular expression given its AST. /// /// The compiler is guaranteed to succeed unless the program exceeds the /// specified size limit. If the size limit is exceeded, then compilation /// stops and returns an error. - pub fn compile(mut self, exprs: &[Hir]) -> result::Result { + pub fn compile( + mut self, + exprs: &[Hir], + ) -> result::Result, Error> { debug_assert!(!exprs.is_empty()); self.num_exprs = exprs.len(); if exprs.len() == 1 { @@ -124,7 +114,7 @@ impl Compiler { } } - fn compile_one(mut self, expr: &Hir) -> result::Result { + fn compile_one(mut self, expr: &Hir) -> result::Result, Error> { // If we're compiling a forward DFA and we aren't anchored, then // add a `.*?` before the first capture group. 
// Other matching engines handle this by baking the logic into the @@ -145,14 +135,14 @@ impl Compiler { } self.fill_to_next(patch.hole); self.compiled.matches = vec![self.insts.len()]; - self.push_compiled(Inst::Match(0)); + self.push_compiled(I::new_match(0)); self.compile_finish() } fn compile_many( mut self, exprs: &[Hir], - ) -> result::Result { + ) -> result::Result, Error> { debug_assert!(exprs.len() > 1); self.compiled.is_anchored_start = @@ -176,7 +166,7 @@ impl Compiler { self.c_capture(0, expr)?.unwrap_or(self.next_inst()); self.fill_to_next(hole); self.compiled.matches.push(self.insts.len()); - self.push_compiled(Inst::Match(i)); + self.push_compiled(I::new_match(i)); prev_hole = self.fill_split(split, Some(entry), None); } let i = exprs.len() - 1; @@ -185,11 +175,11 @@ impl Compiler { self.fill(prev_hole, entry); self.fill_to_next(hole); self.compiled.matches.push(self.insts.len()); - self.push_compiled(Inst::Match(i)); + self.push_compiled(I::new_match(i)); self.compile_finish() } - fn compile_finish(mut self) -> result::Result { + fn compile_finish(mut self) -> result::Result, Error> { self.compiled.insts = self.insts.into_iter().map(|inst| inst.unwrap()).collect(); self.compiled.byte_classes = self.byte_classes.byte_classes(); @@ -474,9 +464,9 @@ impl Compiler { Ok(Some(Patch { hole: hole, entry: self.insts.len() - 1 })) } - fn c_concat<'a, I>(&mut self, exprs: I) -> ResultOrEmpty + fn c_concat<'a, E>(&mut self, exprs: E) -> ResultOrEmpty where - I: IntoIterator, + E: IntoIterator, { let mut exprs = exprs.into_iter(); let Patch { mut hole, entry } = loop { @@ -771,7 +761,7 @@ impl Compiler { } } - fn push_compiled(&mut self, inst: Inst) { + fn push_compiled(&mut self, inst: I) { self.insts.push(MaybeInst::Compiled(inst)); } @@ -795,7 +785,7 @@ impl Compiler { fn check_size(&self) -> result::Result<(), Error> { use std::mem::size_of; - if self.insts.len() * size_of::() > self.size_limit { + if self.insts.len() * size_of::() > self.size_limit { 
Err(Error::CompiledTooBig(self.size_limit)) } else { Ok(()) @@ -822,29 +812,31 @@ impl Hole { } #[derive(Clone, Debug)] -enum MaybeInst { - Compiled(Inst), +enum MaybeInst { + Compiled(I), Uncompiled(InstHole), Split, Split1(InstPtr), Split2(InstPtr), } -impl MaybeInst { +impl> MaybeInst { fn fill(&mut self, goto: InstPtr) { let maybeinst = match *self { MaybeInst::Split => MaybeInst::Split1(goto), - MaybeInst::Uncompiled(ref inst) => { - MaybeInst::Compiled(inst.fill(goto)) + MaybeInst::Uncompiled(ref mut inst) => { + // Replace by dummy `InstHole` + let inst = mem::replace(inst, InstHole::Save { slot: 0 }); + MaybeInst::Compiled((inst, goto).into()) } MaybeInst::Split1(goto1) => { - MaybeInst::Compiled(Inst::Split(InstSplit { + MaybeInst::Compiled(I::new_split(InstSplit { goto1: goto1, goto2: goto, })) } MaybeInst::Split2(goto2) => { - MaybeInst::Compiled(Inst::Split(InstSplit { + MaybeInst::Compiled(I::new_split(InstSplit { goto1: goto, goto2: goto2, })) @@ -861,7 +853,7 @@ impl MaybeInst { fn fill_split(&mut self, goto1: InstPtr, goto2: InstPtr) { let filled = match *self { MaybeInst::Split => { - Inst::Split(InstSplit { goto1: goto1, goto2: goto2 }) + I::new_split(InstSplit { goto1: goto1, goto2: goto2 }) } _ => unreachable!( "must be called on Split instruction, \ @@ -896,7 +888,7 @@ impl MaybeInst { *self = MaybeInst::Split2(half_filled); } - fn unwrap(self) -> Inst { + fn unwrap(self) -> I { match self { MaybeInst::Compiled(inst) => inst, _ => unreachable!( @@ -908,8 +900,10 @@ impl MaybeInst { } } +// TODO: Specialize `compile` into `compile_bytes` and `compile_unicode` +// to avoid making `InstHole` public? 
#[derive(Clone, Debug)] -enum InstHole { +pub enum InstHole { Save { slot: usize }, EmptyLook { look: EmptyLook }, Char { c: char }, @@ -917,32 +911,60 @@ enum InstHole { Bytes { start: u8, end: u8 }, } -impl InstHole { - fn fill(&self, goto: InstPtr) -> Inst { - match *self { +impl From<(InstHole, InstPtr)> for UnicodeInst { + fn from(val: (InstHole, InstPtr)) -> UnicodeInst { + let (hole, goto) = val; + match hole { InstHole::Save { slot } => { - Inst::Save(InstSave { goto: goto, slot: slot }) + UnicodeInst::Save(InstSave { goto: goto, slot: slot }) } InstHole::EmptyLook { look } => { - Inst::EmptyLook(InstEmptyLook { goto: goto, look: look }) + UnicodeInst::EmptyLook(InstEmptyLook { + goto: goto, + look: look, + }) + } + InstHole::Char { c } => { + UnicodeInst::Char(InstChar { goto: goto, c: c }) } - InstHole::Char { c } => Inst::Char(InstChar { goto: goto, c: c }), InstHole::Ranges { ref ranges } => { - Inst::Ranges(InstRanges { goto: goto, ranges: ranges.clone() }) + UnicodeInst::Ranges(InstRanges { + goto: goto, + ranges: ranges.clone(), + }) } - InstHole::Bytes { start, end } => { - Inst::Bytes(InstBytes { goto: goto, start: start, end: end }) + InstHole::Bytes { .. } => unreachable!(), + } + } +} + +impl From<(InstHole, InstPtr)> for BytesInst { + fn from(val: (InstHole, InstPtr)) -> BytesInst { + let (hole, goto) = val; + match hole { + InstHole::Save { slot } => { + BytesInst::Save(InstSave { goto: goto, slot: slot }) } + InstHole::EmptyLook { look } => { + BytesInst::EmptyLook(InstEmptyLook { goto: goto, look: look }) + } + InstHole::Char { .. } => unreachable!(), + InstHole::Ranges { .. 
} => unreachable!(), + InstHole::Bytes { start, end } => BytesInst::Bytes(InstBytes { + goto: goto, + start: start, + end: end, + }), } } } -struct CompileClass<'a, 'b> { - c: &'a mut Compiler, +struct CompileClass<'a, 'b, I: InstTrait> { + c: &'a mut Compiler, ranges: &'b [hir::ClassUnicodeRange], } -impl<'a, 'b> CompileClass<'a, 'b> { +impl<'a, 'b, I: InstTrait + From<(InstHole, usize)>> CompileClass<'a, 'b, I> { fn compile(mut self) -> Result { let mut holes = vec![]; let mut initial_entry = None; @@ -992,9 +1014,9 @@ impl<'a, 'b> CompileClass<'a, 'b> { } } - fn c_utf8_seq_<'r, I>(&mut self, seq: I) -> Result + fn c_utf8_seq_<'r, S>(&mut self, seq: S) -> Result where - I: IntoIterator, + S: IntoIterator, { // The initial instruction for each UTF-8 sequence should be the same. let mut from_inst = ::std::usize::MAX; @@ -1013,17 +1035,14 @@ impl<'a, 'b> CompileClass<'a, 'b> { } } self.c.byte_classes.set_range(byte_range.start, byte_range.end); + let inst_hole = InstHole::Bytes { + start: byte_range.start, + end: byte_range.end, + }; if from_inst == ::std::usize::MAX { - last_hole = self.c.push_hole(InstHole::Bytes { - start: byte_range.start, - end: byte_range.end, - }); + last_hole = self.c.push_hole(inst_hole); } else { - self.c.push_compiled(Inst::Bytes(InstBytes { - goto: from_inst, - start: byte_range.start, - end: byte_range.end, - })); + self.c.push_compiled((inst_hole, from_inst).into()); } from_inst = self.c.insts.len().checked_sub(1).unwrap(); debug_assert!(from_inst < ::std::usize::MAX); diff --git a/src/dfa.rs b/src/dfa.rs index 9ac0c2c39..523c2f615 100644 --- a/src/dfa.rs +++ b/src/dfa.rs @@ -43,7 +43,7 @@ use std::mem; use std::sync::Arc; use exec::ProgramCache; -use prog::{Inst, Program}; +use prog::{BytesInst, InstTrait, Program}; use sparse::SparseSet; /// Return true if and only if the given program can be executed by a DFA. 
@@ -54,8 +54,7 @@ use sparse::SparseSet; /// /// This function will also return false if the given program has any Unicode /// instructions (Char or Ranges) since the DFA operates on bytes only. -pub fn can_exec(insts: &Program) -> bool { - use prog::Inst::*; +pub fn can_exec(insts: &Program) -> bool { // If for some reason we manage to allocate a regex program with more // than i32::MAX instructions, then we can't execute the DFA because we // use 32 bit instruction pointer deltas for memory savings. @@ -65,12 +64,6 @@ pub fn can_exec(insts: &Program) -> bool { if insts.dfa_size_limit == 0 || insts.len() > ::std::i32::MAX as usize { return false; } - for inst in insts { - match *inst { - Char(_) | Ranges(_) => return false, - EmptyLook(_) | Match(_) | Save(_) | Split(_) | Bytes(_) => {} - } - } true } @@ -172,7 +165,7 @@ pub struct Fsm<'a> { /// the `dfa` instructions or the `dfa_reverse` instructions from /// `exec::ExecReadOnly`. (It never uses `ExecReadOnly.nfa`, which may have /// Unicode opcodes that cannot be executed by the DFA.) - prog: &'a Program, + prog: &'a Program, /// The start state. We record it here because the pointer may change /// when the cache is wiped. start: StatePtr, @@ -411,7 +404,7 @@ struct StateFlags(u8); impl Cache { /// Create new empty cache for the DFA engine. - pub fn new(prog: &Program) -> Self { + pub fn new(prog: &Program) -> Self { // We add 1 to account for the special EOF byte. 
let num_byte_classes = (prog.byte_classes[255] as usize + 1) + 1; let starts = vec![STATE_UNKNOWN; 256]; @@ -445,7 +438,7 @@ impl CacheInner { impl<'a> Fsm<'a> { #[cfg_attr(feature = "perf-inline", inline(always))] pub fn forward( - prog: &'a Program, + prog: &'a Program, cache: &ProgramCache, quit_after_match: bool, text: &[u8], @@ -475,7 +468,7 @@ impl<'a> Fsm<'a> { #[cfg_attr(feature = "perf-inline", inline(always))] pub fn reverse( - prog: &'a Program, + prog: &'a Program, cache: &ProgramCache, quit_after_match: bool, text: &[u8], @@ -505,7 +498,7 @@ impl<'a> Fsm<'a> { #[cfg_attr(feature = "perf-inline", inline(always))] pub fn forward_many( - prog: &'a Program, + prog: &'a Program, cache: &ProgramCache, matches: &mut [bool], text: &[u8], @@ -539,7 +532,7 @@ impl<'a> Fsm<'a> { debug_assert!(dfa.last_match_si != STATE_UNKNOWN); debug_assert!(dfa.last_match_si != STATE_DEAD); for ip in dfa.state(dfa.last_match_si).inst_ptrs() { - if let Inst::Match(slot) = dfa.prog[ip] { + if let BytesInst::Match(slot) = dfa.prog[ip] { matches[slot] = true; } } @@ -894,7 +887,7 @@ impl<'a> Fsm<'a> { mut si: StatePtr, b: Byte, ) -> Option { - use prog::Inst::*; + use prog::BytesInst::*; // Initialize a queue with the current DFA state's NFA states. qcur.clear(); @@ -957,8 +950,6 @@ impl<'a> Fsm<'a> { qnext.clear(); for &ip in &*qcur { match self.prog[ip as usize] { - // These states never happen in a byte-based program. - Char(_) | Ranges(_) => unreachable!(), // These states are handled when following epsilon transitions. Save(_) | Split(_) | EmptyLook(_) => {} Match(_) => { @@ -1056,8 +1047,8 @@ impl<'a> Fsm<'a> { q: &mut SparseSet, flags: EmptyFlags, ) { + use prog::BytesInst::*; use prog::EmptyLook::*; - use prog::Inst::*; // We need to traverse the NFA to follow epsilon transitions, so avoid // recursion with an explicit stack. 
@@ -1072,7 +1063,6 @@ impl<'a> Fsm<'a> { } q.insert(ip as usize); match self.prog[ip as usize] { - Char(_) | Ranges(_) => unreachable!(), Match(_) | Bytes(_) => { break; } @@ -1190,7 +1180,7 @@ impl<'a> Fsm<'a> { q: &SparseSet, state_flags: &mut StateFlags, ) -> Option { - use prog::Inst::*; + use prog::BytesInst::*; // We need to build up enough information to recognize pre-built states // in the DFA. Generally speaking, this includes every instruction @@ -1211,7 +1201,6 @@ impl<'a> Fsm<'a> { for &ip in q { let ip = usize_to_u32(ip); match self.prog[ip as usize] { - Char(_) | Ranges(_) => unreachable!(), Save(_) | Split(_) => {} Bytes(_) => push_inst_ptr(&mut insts, &mut prev, ip), EmptyLook(_) => { diff --git a/src/exec.rs b/src/exec.rs index 3d5a52bea..2348e357f 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -18,7 +18,7 @@ use input::{ByteInput, CharInput}; use literal::LiteralSearcher; use pikevm; use pool::{Pool, PoolGuard}; -use prog::Program; +use prog::{BytesInst, Program, UnicodeInst}; use re_builder::RegexOptions; use re_bytes; use re_set; @@ -61,6 +61,76 @@ pub struct ExecNoSync<'c> { #[derive(Debug)] pub struct ExecNoSyncStr<'c>(ExecNoSync<'c>); +#[derive(Debug)] +enum NfaProgram { + Bytes(Program), + Unicode(Program), +} + +impl NfaProgram { + #[inline] + fn len(&self) -> usize { + match self { + NfaProgram::Bytes(program) => program.len(), + NfaProgram::Unicode(program) => program.len(), + } + } + + #[inline] + fn is_empty(&self) -> bool { + match self { + NfaProgram::Bytes(program) => program.is_empty(), + NfaProgram::Unicode(program) => program.is_empty(), + } + } + + #[inline] + fn is_anchored_start(&self) -> bool { + match self { + NfaProgram::Bytes(program) => program.is_anchored_start, + NfaProgram::Unicode(program) => program.is_anchored_start, + } + } + + #[inline] + fn is_anchored_end(&self) -> bool { + match self { + NfaProgram::Bytes(program) => program.is_anchored_end, + NfaProgram::Unicode(program) => program.is_anchored_end, + } + } + + 
#[inline] + fn only_utf8(&self) -> bool { + match self { + NfaProgram::Bytes(program) => program.only_utf8, + NfaProgram::Unicode(program) => program.only_utf8, + } + } + + #[inline] + fn prefixes(&self) -> &LiteralSearcher { + match self { + NfaProgram::Bytes(program) => &program.prefixes, + NfaProgram::Unicode(program) => &program.prefixes, + } + } + + pub fn capture_name_idx(&self) -> &Arc> { + match self { + NfaProgram::Bytes(program) => &program.capture_name_idx, + NfaProgram::Unicode(program) => &program.capture_name_idx, + } + } + + pub fn captures(&self) -> &[Option] { + match self { + NfaProgram::Bytes(program) => &program.captures, + NfaProgram::Unicode(program) => &program.captures, + } + } +} + /// `ExecReadOnly` comprises all read only state for a regex. Namely, all such /// state is determined at compile time and never changes during search. #[derive(Debug)] @@ -72,17 +142,17 @@ struct ExecReadOnly { /// /// N.B. It is not possibly to make this byte-based from the public API. /// It is only used for testing byte based programs in the NFA simulations. - nfa: Program, + nfa: NfaProgram, /// A compiled byte based program for DFA execution. This is only used /// if a DFA can be executed. (Currently, only word boundary assertions are /// not supported.) Note that this program contains an embedded `.*?` /// preceding the first capture group, unless the regex is anchored at the /// beginning. - dfa: Program, + dfa: Program, /// The same as above, except the program is reversed (and there is no /// preceding `.*?`). This is used by the DFA to find the starting location /// of matches. - dfa_reverse: Program, + dfa_reverse: Program, /// A set of suffix literals extracted from the regex. 
/// /// Prefix literals are stored on the `Program`, since they are used inside @@ -302,7 +372,7 @@ impl ExecBuilder { if self.options.pats.is_empty() { let ro = Arc::new(ExecReadOnly { res: vec![], - nfa: Program::new(), + nfa: NfaProgram::Unicode(Program::new()), dfa: Program::new(), dfa_reverse: Program::new(), suffixes: LiteralSearcher::empty(), @@ -314,11 +384,6 @@ impl ExecBuilder { return Ok(Exec { ro: ro, pool }); } let parsed = self.parse()?; - let mut nfa = Compiler::new() - .size_limit(self.options.size_limit) - .bytes(self.bytes || parsed.bytes) - .only_utf8(self.only_utf8) - .compile(&parsed.exprs)?; let mut dfa = Compiler::new() .size_limit(self.options.size_limit) .dfa(true) @@ -333,8 +398,25 @@ impl ExecBuilder { #[cfg(feature = "perf-literal")] let ac = self.build_aho_corasick(&parsed); - nfa.prefixes = LiteralSearcher::prefixes(parsed.prefixes); - dfa.prefixes = nfa.prefixes.clone(); + + let nfa = if self.bytes || parsed.bytes { + let mut program = Compiler::new() + .size_limit(self.options.size_limit) + .only_utf8(self.only_utf8) + .compile(&parsed.exprs)?; + program.prefixes = LiteralSearcher::prefixes(parsed.prefixes); + dfa.prefixes = program.prefixes.clone(); + NfaProgram::Bytes(program) + } else { + let mut program = Compiler::new() + .size_limit(self.options.size_limit) + .only_utf8(self.only_utf8) + .compile(&parsed.exprs)?; + program.prefixes = LiteralSearcher::prefixes(parsed.prefixes); + dfa.prefixes = program.prefixes.clone(); + NfaProgram::Unicode(program) + }; + dfa.dfa_size_limit = self.options.dfa_size_limit; dfa_reverse.dfa_size_limit = self.options.dfa_size_limit; @@ -428,7 +510,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { /// are two slots for every capture group, corresponding to possibly empty /// start and end locations of the capture.) 
fn slots_len(&self) -> usize { - self.ro.nfa.captures.len() * 2 + self.ro.nfa.captures().len() * 2 } fn next_after_empty(&self, _text: &[u8], i: usize) -> usize { @@ -630,7 +712,7 @@ impl<'c> RegularExpression for ExecNoSync<'c> { } #[cfg(feature = "perf-dfa")] MatchType::Dfa => { - if self.ro.nfa.is_anchored_start { + if self.ro.nfa.is_anchored_start() { self.captures_nfa(slots, text, start) } else { match self.find_dfa_forward(text, start) { @@ -701,12 +783,12 @@ impl<'c> ExecNoSync<'c> { use self::MatchLiteralType::*; match ty { Unanchored => { - let lits = &self.ro.nfa.prefixes; + let lits = self.ro.nfa.prefixes(); lits.find(&text[start..]).map(|(s, e)| (start + s, start + e)) } AnchoredStart => { - let lits = &self.ro.nfa.prefixes; - if start == 0 || !self.ro.nfa.is_anchored_start { + let lits = self.ro.nfa.prefixes(); + if start == 0 || !self.ro.nfa.is_anchored_start() { lits.find_start(&text[start..]) .map(|(s, e)| (start + s, start + e)) } else { @@ -1086,20 +1168,19 @@ impl<'c> ExecNoSync<'c> { start: usize, end: usize, ) -> bool { - if self.ro.nfa.uses_bytes() { - pikevm::Fsm::exec( - &self.ro.nfa, + match self.ro.nfa { + NfaProgram::Bytes(ref program) => pikevm::Fsm::exec( + program, self.cache.value(), matches, slots, quit_after_match, - ByteInput::new(text, self.ro.nfa.only_utf8), + ByteInput::new(text, self.ro.nfa.only_utf8()), start, end, - ) - } else { - pikevm::Fsm::exec( - &self.ro.nfa, + ), + NfaProgram::Unicode(ref program) => pikevm::Fsm::exec( + program, self.cache.value(), matches, slots, @@ -1107,7 +1188,7 @@ impl<'c> ExecNoSync<'c> { CharInput::new(text), start, end, - ) + ), } } @@ -1120,26 +1201,25 @@ impl<'c> ExecNoSync<'c> { start: usize, end: usize, ) -> bool { - if self.ro.nfa.uses_bytes() { - backtrack::Bounded::exec( - &self.ro.nfa, + match self.ro.nfa { + NfaProgram::Bytes(ref program) => backtrack::Bounded::exec( + program, self.cache.value(), matches, slots, - ByteInput::new(text, self.ro.nfa.only_utf8), + ByteInput::new(text, 
self.ro.nfa.only_utf8()), start, end, - ) - } else { - backtrack::Bounded::exec( - &self.ro.nfa, + ), + NfaProgram::Unicode(ref program) => backtrack::Bounded::exec( + program, self.cache.value(), matches, slots, CharInput::new(text), start, end, - ) + ), } } @@ -1237,7 +1317,7 @@ impl<'c> ExecNoSync<'c> { #[cfg(feature = "perf-literal")] fn imp(ro: &ExecReadOnly, text: &[u8]) -> bool { // Only do this check if the haystack is big (>1MB). - if text.len() > (1 << 20) && ro.nfa.is_anchored_end { + if text.len() > (1 << 20) && ro.nfa.is_anchored_end() { let lcs = ro.suffixes.lcs(); if lcs.len() >= 1 && !lcs.is_suffix(text) { return false; @@ -1250,7 +1330,7 @@ impl<'c> ExecNoSync<'c> { } pub fn capture_name_idx(&self) -> &Arc> { - &self.ro.nfa.capture_name_idx + &self.ro.nfa.capture_name_idx() } } @@ -1306,13 +1386,13 @@ impl Exec { /// /// Any capture that isn't named is None. pub fn capture_names(&self) -> &[Option] { - &self.ro.nfa.captures + &self.ro.nfa.captures() } /// Return a reference to named groups mapping (from group name to /// group position). pub fn capture_name_idx(&self) -> &Arc> { - &self.ro.nfa.capture_name_idx + &self.ro.nfa.capture_name_idx() } } @@ -1329,7 +1409,7 @@ impl ExecReadOnly { return hint.unwrap(); } // If the NFA is empty, then we'll never match anything. 
- if self.nfa.insts.is_empty() { + if self.nfa.is_empty() { return MatchType::Nothing; } if let Some(literalty) = self.choose_literal_match_type() { @@ -1371,15 +1451,15 @@ impl ExecReadOnly { MatchLiteralType::AhoCorasick, )); } - if ro.nfa.prefixes.complete() { - return if ro.nfa.is_anchored_start { + if ro.nfa.prefixes().complete() { + return if ro.nfa.is_anchored_start() { Some(MatchType::Literal(MatchLiteralType::AnchoredStart)) } else { Some(MatchType::Literal(MatchLiteralType::Unanchored)) }; } if ro.suffixes.complete() { - return if ro.nfa.is_anchored_end { + return if ro.nfa.is_anchored_end() { Some(MatchType::Literal(MatchLiteralType::AnchoredEnd)) } else { // This case shouldn't happen. When the regex isn't @@ -1412,7 +1492,7 @@ impl ExecReadOnly { } // If the regex is anchored at the end but not the start, then // just match in reverse from the end of the haystack. - if !ro.nfa.is_anchored_start && ro.nfa.is_anchored_end { + if !ro.nfa.is_anchored_start() && ro.nfa.is_anchored_end() { return Some(MatchType::DfaAnchoredReverse); } #[cfg(feature = "perf-literal")] @@ -1536,8 +1616,20 @@ pub struct ProgramCacheInner { impl ProgramCacheInner { fn new(ro: &ExecReadOnly) -> Self { ProgramCacheInner { - pikevm: pikevm::Cache::new(&ro.nfa), - backtrack: backtrack::Cache::new(&ro.nfa), + pikevm: match ro.nfa { + NfaProgram::Bytes(ref program) => pikevm::Cache::new(program), + NfaProgram::Unicode(ref program) => { + pikevm::Cache::new(program) + } + }, + backtrack: match ro.nfa { + NfaProgram::Bytes(ref program) => { + backtrack::Cache::new(program) + } + NfaProgram::Unicode(ref program) => { + backtrack::Cache::new(program) + } + }, #[cfg(feature = "perf-dfa")] dfa: dfa::Cache::new(&ro.dfa), #[cfg(feature = "perf-dfa")] diff --git a/src/lib.rs b/src/lib.rs index 357ac0dd0..15a63f204 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -781,5 +781,5 @@ pub mod internal { pub use exec::{Exec, ExecBuilder}; pub use input::{Char, CharInput, Input, InputAt}; pub use 
literal::LiteralSearcher; - pub use prog::{EmptyLook, Inst, InstRanges, Program}; + pub use prog::{BytesInst, EmptyLook, InstRanges, Program, UnicodeInst}; } diff --git a/src/pikevm.rs b/src/pikevm.rs index 299087da8..f6cd88be7 100644 --- a/src/pikevm.rs +++ b/src/pikevm.rs @@ -19,17 +19,17 @@ use std::mem; use exec::ProgramCache; use input::{Input, InputAt}; -use prog::{InstPtr, Program}; +use prog::{BytesInst, InstPtr, InstTrait, Program, UnicodeInst}; use re_trait::Slot; use sparse::SparseSet; /// An NFA simulation matching engine. #[derive(Debug)] -pub struct Fsm<'r, I> { +pub struct Fsm<'r, I, P: InstTrait> { /// The sequence of opcodes (among other things) that is actually executed. /// /// The program may be byte oriented or Unicode codepoint oriented. - prog: &'r Program, + prog: &'r Program

, /// An explicit stack used for following epsilon transitions. (This is /// borrowed from the cache.) stack: &'r mut Vec<FollowEpsilon>, @@ -49,7 +49,7 @@ pub struct Cache { /// An ordered set of NFA states and their captures. #[derive(Clone, Debug)] -struct Threads { +pub struct Threads { /// An ordered set of opcodes (each opcode is an NFA state). set: SparseSet, /// Captures for every NFA state. @@ -75,18 +75,18 @@ enum FollowEpsilon { impl Cache { /// Create a new allocation used by the NFA machine to record execution /// and captures. - pub fn new(_prog: &Program) -> Self { + pub fn new(_prog: &Program) -> Self { Cache { clist: Threads::new(), nlist: Threads::new(), stack: vec![] } } } -impl<'r, I: Input> Fsm<'r, I> { +impl<'r, I: Input, P: InstTrait + Step> Fsm<'r, I, P> { /// Execute the NFA matching engine. /// /// If there's a match, `exec` returns `true` and populates the given /// captures accordingly. pub fn exec( - prog: &'r Program, + prog: &'r Program<P>
, cache: &ProgramCache, matches: &mut [bool], slots: &mut [Slot], @@ -231,39 +231,15 @@ impl<'r, I: Input> Fsm<'r, I> { at: InputAt, at_next: InputAt, ) -> bool { - use prog::Inst::*; - match self.prog[ip] { - Match(match_slot) => { - if match_slot < matches.len() { - matches[match_slot] = true; - } - for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) { - *slot = *val; - } - true - } - Char(ref inst) => { - if inst.c == at.char() { - self.add(nlist, thread_caps, inst.goto, at_next); - } - false - } - Ranges(ref inst) => { - if inst.matches(at.char()) { - self.add(nlist, thread_caps, inst.goto, at_next); - } - false - } - Bytes(ref inst) => { - if let Some(b) = at.byte() { - if inst.matches(b) { - self.add(nlist, thread_caps, inst.goto, at_next); - } - } - false - } - EmptyLook(_) | Save(_) | Split(_) => false, - } + self.prog[ip].step( + self, + nlist, + matches, + slots, + thread_caps, + at, + at_next, + ) } /// Follows epsilon transitions and adds them for processing to nlist, @@ -300,40 +276,196 @@ impl<'r, I: Input> Fsm<'r, I> { // traverse the set of states. We only push to the stack when we // absolutely need recursion (restoring captures or following a // branch). - use prog::Inst::*; loop { // Don't visit states we've already added. 
if nlist.set.contains(ip) { return; } nlist.set.insert(ip); - match self.prog[ip] { - EmptyLook(ref inst) => { - if self.input.is_empty_match(at, inst) { - ip = inst.goto; - } + if let Some(next_ip) = + self.prog[ip].add_step(self, nlist, thread_caps, ip, at) + { + ip = next_ip; + } else { + return; + } + } + } +} + +pub trait Step: InstTrait + Sized { + fn step<'r, I: Input>( + &self, + fsm: &mut Fsm<'r, I, Self>, + nlist: &mut Threads, + matches: &mut [bool], + slots: &mut [Slot], + thread_caps: &mut [Option], + at: InputAt, + at_next: InputAt, + ) -> bool; + + fn add_step<'r, I: Input>( + &self, + fsm: &mut Fsm<'r, I, Self>, + nlist: &mut Threads, + thread_caps: &mut [Option], + ip: usize, + at: InputAt, + ) -> Option; +} + +impl Step for UnicodeInst { + fn step<'r, I: Input>( + &self, + fsm: &mut Fsm<'r, I, UnicodeInst>, + nlist: &mut Threads, + matches: &mut [bool], + slots: &mut [Slot], + thread_caps: &mut [Option], + at: InputAt, + at_next: InputAt, + ) -> bool { + use prog::UnicodeInst::*; + match *self { + Match(match_slot) => { + if match_slot < matches.len() { + matches[match_slot] = true; } - Save(ref inst) => { - if inst.slot < thread_caps.len() { - self.stack.push(FollowEpsilon::Capture { - slot: inst.slot, - pos: thread_caps[inst.slot], - }); - thread_caps[inst.slot] = Some(at.pos()); - } - ip = inst.goto; + for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) { + *slot = *val; } - Split(ref inst) => { - self.stack.push(FollowEpsilon::IP(inst.goto2)); - ip = inst.goto1; + true + } + Char(ref inst) => { + if inst.c == at.char() { + fsm.add(nlist, thread_caps, inst.goto, at_next); + } + false + } + Ranges(ref inst) => { + if inst.matches(at.char()) { + fsm.add(nlist, thread_caps, inst.goto, at_next); + } + false + } + EmptyLook(_) | Save(_) | Split(_) => false, + } + } + + fn add_step<'r, I: Input>( + &self, + fsm: &mut Fsm<'r, I, UnicodeInst>, + nlist: &mut Threads, + thread_caps: &mut [Option], + ip: usize, + at: InputAt, + ) -> Option { + use 
prog::UnicodeInst::*; + match *self { + EmptyLook(ref inst) => { + if fsm.input.is_empty_match(at, inst) { + Some(inst.goto) + } else { + Some(ip) + } + } + Save(ref inst) => { + if inst.slot < thread_caps.len() { + fsm.stack.push(FollowEpsilon::Capture { + slot: inst.slot, + pos: thread_caps[inst.slot], + }); + thread_caps[inst.slot] = Some(at.pos()); + } + Some(inst.goto) + } + Split(ref inst) => { + fsm.stack.push(FollowEpsilon::IP(inst.goto2)); + Some(inst.goto1) + } + Match(_) | Char(_) | Ranges(_) => { + let t = &mut nlist.caps(ip); + for (slot, val) in t.iter_mut().zip(thread_caps.iter()) { + *slot = *val; + } + None + } + } + } +} + +impl Step for BytesInst { + fn step<'r, I: Input>( + &self, + fsm: &mut Fsm<'r, I, BytesInst>, + nlist: &mut Threads, + matches: &mut [bool], + slots: &mut [Slot], + thread_caps: &mut [Option], + at: InputAt, + at_next: InputAt, + ) -> bool { + use prog::BytesInst::*; + match *self { + Match(match_slot) => { + if match_slot < matches.len() { + matches[match_slot] = true; } - Match(_) | Char(_) | Ranges(_) | Bytes(_) => { - let t = &mut nlist.caps(ip); - for (slot, val) in t.iter_mut().zip(thread_caps.iter()) { - *slot = *val; + for (slot, val) in slots.iter_mut().zip(thread_caps.iter()) { + *slot = *val; + } + true + } + Bytes(ref inst) => { + if let Some(b) = at.byte() { + if inst.matches(b) { + fsm.add(nlist, thread_caps, inst.goto, at_next); } - return; } + false + } + EmptyLook(_) | Save(_) | Split(_) => false, + } + } + + fn add_step<'r, I: Input>( + &self, + fsm: &mut Fsm<'r, I, BytesInst>, + nlist: &mut Threads, + thread_caps: &mut [Option], + ip: usize, + at: InputAt, + ) -> Option { + use prog::BytesInst::*; + match *self { + EmptyLook(ref inst) => { + if fsm.input.is_empty_match(at, inst) { + Some(inst.goto) + } else { + Some(ip) + } + } + Save(ref inst) => { + if inst.slot < thread_caps.len() { + fsm.stack.push(FollowEpsilon::Capture { + slot: inst.slot, + pos: thread_caps[inst.slot], + }); + thread_caps[inst.slot] = 
Some(at.pos()); + } + Some(inst.goto) + } + Split(ref inst) => { + fsm.stack.push(FollowEpsilon::IP(inst.goto2)); + Some(inst.goto1) + } + Match(_) | Bytes(_) => { + let t = &mut nlist.caps(ip); + for (slot, val) in t.iter_mut().zip(thread_caps.iter()) { + *slot = *val; + } + None } } } diff --git a/src/prog.rs b/src/prog.rs index 74e5f2f6f..640b71b38 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -15,9 +15,9 @@ pub type InstPtr = usize; /// Program is a sequence of instructions and various facts about thos /// instructions. #[derive(Clone)] -pub struct Program { +pub struct Program { /// A sequence of instructions that represents an NFA. - pub insts: Vec, + pub insts: Vec, /// Pointers to each Match instruction in the sequence. /// /// This is always length 1 unless this program represents a regex set. @@ -38,9 +38,6 @@ pub struct Program { pub byte_classes: Vec, /// When true, this program can only match valid UTF-8. pub only_utf8: bool, - /// When true, this program uses byte range instructions instead of Unicode - /// range instructions. - pub is_bytes: bool, /// When true, the program is compiled for DFA matching. For example, this /// implies `is_bytes` and also inserts a preceding `.*?` for unanchored /// regexes. @@ -74,7 +71,7 @@ pub struct Program { pub dfa_size_limit: usize, } -impl Program { +impl Program { /// Creates an empty instruction sequence. Fields are given default /// values. pub fn new() -> Self { @@ -86,7 +83,6 @@ impl Program { start: 0, byte_classes: vec![0; 256], only_utf8: true, - is_bytes: false, is_dfa: false, is_reverse: false, is_anchored_start: false, @@ -101,9 +97,9 @@ impl Program { /// next pc that is not a no-op instruction. pub fn skip(&self, mut pc: usize) -> usize { loop { - match self[pc] { - Inst::Save(ref i) => pc = i.goto, - _ => return pc, + match self[pc].save_goto() { + Some(goto) => pc = goto, + None => return pc, } } } @@ -117,10 +113,7 @@ impl Program { // meaningless. 
return false; } - match self[self.skip(pc)] { - Inst::Match(_) => true, - _ => false, - } + self[self.skip(pc)].is_match() } /// Returns true if the current configuration demands that an implicit @@ -132,7 +125,7 @@ impl Program { /// Returns true if this program uses Byte instructions instead of /// Char/Range instructions. pub fn uses_bytes(&self) -> bool { - self.is_bytes || self.is_dfa + I::IS_BYTES || self.is_dfa } /// Returns true if this program exclusively matches valid UTF-8 bytes. @@ -148,7 +141,7 @@ impl Program { // The only instruction that uses heap space is Ranges (for // Unicode codepoint programs) to store non-overlapping codepoint // ranges. To keep this operation constant time, we ignore them. - (self.len() * mem::size_of::()) + (self.len() * mem::size_of::()) + (self.matches.len() * mem::size_of::()) + (self.captures.len() * mem::size_of::>()) + (self.capture_name_idx.len() @@ -158,8 +151,8 @@ impl Program { } } -impl Deref for Program { - type Target = [Inst]; +impl Deref for Program { + type Target = [I]; #[cfg_attr(feature = "perf-inline", inline(always))] fn deref(&self) -> &Self::Target { @@ -167,67 +160,13 @@ impl Deref for Program { } } -impl fmt::Debug for Program { +impl fmt::Debug for Program { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - use self::Inst::*; - - fn with_goto(cur: usize, goto: usize, fmtd: String) -> String { - if goto == cur + 1 { - fmtd - } else { - format!("{} (goto: {})", fmtd, goto) - } - } - - fn visible_byte(b: u8) -> String { - use std::ascii::escape_default; - let escaped = escape_default(b).collect::>(); - String::from_utf8_lossy(&escaped).into_owned() - } - for (pc, inst) in self.iter().enumerate() { - match *inst { - Match(slot) => write!(f, "{:04} Match({:?})", pc, slot)?, - Save(ref inst) => { - let s = format!("{:04} Save({})", pc, inst.slot); - write!(f, "{}", with_goto(pc, inst.goto, s))?; - } - Split(ref inst) => { - write!( - f, - "{:04} Split({}, {})", - pc, inst.goto1, inst.goto2 - )?; - 
} - EmptyLook(ref inst) => { - let s = format!("{:?}", inst.look); - write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; - } - Char(ref inst) => { - let s = format!("{:?}", inst.c); - write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; - } - Ranges(ref inst) => { - let ranges = inst - .ranges - .iter() - .map(|r| format!("{:?}-{:?}", r.0, r.1)) - .collect::>() - .join(", "); - write!( - f, - "{:04} {}", - pc, - with_goto(pc, inst.goto, ranges) - )?; - } - Bytes(ref inst) => { - let s = format!( - "Bytes({}, {})", - visible_byte(inst.start), - visible_byte(inst.end) - ); - write!(f, "{:04} {}", pc, with_goto(pc, inst.goto, s))?; + write!(f, "{:04} {:?}", pc, inst)?; + if let Some(goto) = inst.goto() { + if pc + 1 == goto { + write!(f, " (goto: {})", goto)?; } } if pc == self.start { @@ -239,33 +178,34 @@ impl fmt::Debug for Program { } } -impl<'a> IntoIterator for &'a Program { - type Item = &'a Inst; - type IntoIter = slice::Iter<'a, Inst>; +impl<'a, I: InstTrait> IntoIterator for &'a Program { + type Item = &'a I; + type IntoIter = slice::Iter<'a, I>; fn into_iter(self) -> Self::IntoIter { self.iter() } } -/// Inst is an instruction code in a Regex program. +/// `InstTrait` represents an instruction code in a Regex program. /// /// Regrettably, a regex program either contains Unicode codepoint -/// instructions (Char and Ranges) or it contains byte instructions (Bytes). +/// instructions (Char and Ranges: [`UnicodeInst`]) or it contains +/// byte instructions (Bytes: [`BytesInst`]). /// A regex program can never contain both. -/// -/// It would be worth investigating splitting this into two distinct types and -/// then figuring out how to make the matching engines polymorphic over those -/// types without sacrificing performance. -/// -/// Other than the benefit of moving invariants into the type system, another -/// benefit is the decreased size. 
If we remove the `Char` and `Ranges` -/// instructions from the `Inst` enum, then its size shrinks from 40 bytes to -/// 24 bytes. (This is because of the removal of a `Vec` in the `Ranges` -/// variant.) Given that byte based machines are typically much bigger than -/// their Unicode analogues (because they can decode UTF-8 directly), this ends -/// up being a pretty significant savings. -#[derive(Clone, Debug)] -pub enum Inst { +pub trait InstTrait: fmt::Debug { + const IS_BYTES: bool; + + /// Returns true if and only if this is a match instruction. + fn is_match(&self) -> bool; + fn goto(&self) -> Option; + fn save_goto(&self) -> Option; + fn new_match(i: usize) -> Self; + fn new_split(split: InstSplit) -> Self; +} + +/// A Unicode codepoint instruction. +#[derive(Clone)] +pub enum UnicodeInst { /// Match indicates that the program has reached a match state. /// /// The number in the match corresponds to the Nth logical regular @@ -289,20 +229,173 @@ pub enum Inst { /// Ranges requires the regex program to match the character at the current /// position in the input with one of the ranges specified in InstRanges. 
Ranges(InstRanges), +} + +impl InstTrait for UnicodeInst { + const IS_BYTES: bool = false; + + #[inline] + fn is_match(&self) -> bool { + match *self { + Self::Match(_) => true, + _ => false, + } + } + + #[inline] + fn goto(&self) -> Option { + match self { + Self::Match(_) => None, + Self::Save(ref inst) => Some(inst.goto), + Self::Split(_) => None, + Self::EmptyLook(ref inst) => Some(inst.goto), + Self::Char(ref inst) => Some(inst.goto), + Self::Ranges(ref inst) => Some(inst.goto), + } + } + + #[inline] + fn save_goto(&self) -> Option { + match self { + Self::Save(ref inst) => Some(inst.goto), + _ => None, + } + } + + #[inline] + fn new_match(i: usize) -> Self { + Self::Match(i) + } + + #[inline] + fn new_split(split: InstSplit) -> Self { + Self::Split(split) + } +} + +impl fmt::Debug for UnicodeInst { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::Match(slot) => write!(f, "Match({:?})", slot), + Self::Save(ref inst) => write!(f, "Save({})", inst.slot), + Self::Split(ref inst) => { + write!(f, "Split({}, {})", inst.goto1, inst.goto2) + } + Self::EmptyLook(ref inst) => { + write!(f, "{:?}", inst.look) + } + Self::Char(ref inst) => { + write!(f, "{:?}", inst.c) + } + Self::Ranges(ref inst) => { + write!( + f, + "{}", + inst.ranges + .iter() + .map(|r| format!("{:?}-{:?}", r.0, r.1)) + .collect::>() + .join(", ") + ) + } + } + } +} + +/// A byte instruction. +#[derive(Clone)] +pub enum BytesInst { + /// Match indicates that the program has reached a match state. + /// + /// The number in the match corresponds to the Nth logical regular + /// expression in this program. This index is always 0 for normal regex + /// programs. Values greater than 0 appear when compiling regex sets, and + /// each match instruction gets its own unique value. The value corresponds + /// to the Nth regex in the set. + Match(usize), + /// Save causes the program to save the current location of the input in + /// the slot indicated by InstSave. 
+ Save(InstSave), + /// Split causes the program to diverge to one of two paths in the + /// program, preferring goto1 in InstSplit. + Split(InstSplit), + /// EmptyLook represents a zero-width assertion in a regex program. A + /// zero-width assertion does not consume any of the input text. + EmptyLook(InstEmptyLook), /// Bytes is like Ranges, except it expresses a single byte range. It is /// used in conjunction with Split instructions to implement multi-byte /// character classes. Bytes(InstBytes), } -impl Inst { - /// Returns true if and only if this is a match instruction. - pub fn is_match(&self) -> bool { +impl InstTrait for BytesInst { + const IS_BYTES: bool = true; + + #[inline] + fn is_match(&self) -> bool { match *self { - Inst::Match(_) => true, + Self::Match(_) => true, _ => false, } } + + #[inline] + fn goto(&self) -> Option { + match self { + Self::Match(_) => None, + Self::Save(ref inst) => Some(inst.goto), + Self::Split(_) => None, + Self::EmptyLook(ref inst) => Some(inst.goto), + Self::Bytes(ref inst) => Some(inst.goto), + } + } + + #[inline] + fn save_goto(&self) -> Option { + match self { + Self::Save(ref inst) => Some(inst.goto), + _ => None, + } + } + + #[inline] + fn new_match(i: usize) -> Self { + Self::Match(i) + } + + #[inline] + fn new_split(split: InstSplit) -> Self { + Self::Split(split) + } +} + +impl fmt::Debug for BytesInst { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn visible_byte(b: u8) -> String { + use std::ascii::escape_default; + let escaped = escape_default(b).collect::>(); + String::from_utf8_lossy(&escaped).into_owned() + } + + match self { + Self::Match(slot) => write!(f, "Match({:?})", slot), + Self::Save(ref inst) => write!(f, "Save({})", inst.slot), + Self::Split(ref inst) => { + write!(f, "Split({}, {})", inst.goto1, inst.goto2) + } + Self::EmptyLook(ref inst) => { + write!(f, "{:?}", inst.look) + } + Self::Bytes(ref inst) => { + write!( + f, + "Bytes({}, {})", + visible_byte(inst.start), + 
visible_byte(inst.end) + ) + } + } + } } /// Representation of the Save instruction. @@ -432,3 +525,17 @@ impl InstBytes { self.start <= byte && byte <= self.end } } + +#[cfg(test)] +mod test { + #[test] + #[cfg(target_pointer_width = "64")] + fn test_size_of_inst() { + use std::mem::size_of; + + use super::{BytesInst, UnicodeInst}; + + assert_eq!(24, size_of::<BytesInst>()); + assert_eq!(40, size_of::<UnicodeInst>()); + } +}