From 31cdd442961ef56afb4203f365f80e7e28e6e2a6 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Sat, 24 Sep 2022 16:02:00 +0200 Subject: [PATCH 1/3] Fixed typo in src/prog.rs. --- src/prog.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prog.rs b/src/prog.rs index c211f71d8..1775cf325 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -12,7 +12,7 @@ use crate::literal::LiteralSearcher; /// `InstPtr` represents the index of an instruction in a regex program. pub type InstPtr = usize; -/// Program is a sequence of instructions and various facts about thos +/// Program is a sequence of instructions and various facts about those /// instructions. #[derive(Clone)] pub struct Program { From 7874e124c4b66fa6d597953f166b9b39d05043d6 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Sat, 24 Sep 2022 17:07:09 +0200 Subject: [PATCH 2/3] Added Regex::participating_captures_len. --- regex-syntax/src/hir/mod.rs | 54 ++++++++++++++++++++++++++++++++++++- src/compile.rs | 1 + src/exec.rs | 9 ++++++- src/prog.rs | 4 +++ src/re_bytes.rs | 7 +++++ src/re_unicode.rs | 7 +++++ tests/api.rs | 38 ++++++++++++++++++++++++++ 7 files changed, 118 insertions(+), 2 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 1096e9f05..8c5e4ba9d 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -243,6 +243,7 @@ impl Hir { info.set_match_empty(true); info.set_literal(false); info.set_alternation_literal(false); + info.static_capture_count = Some(0); Hir { kind: HirKind::Empty, info } } @@ -268,6 +269,7 @@ impl Hir { info.set_match_empty(false); info.set_literal(true); info.set_alternation_literal(true); + info.static_capture_count = Some(0); Hir { kind: HirKind::Literal(lit), info } } @@ -285,6 +287,7 @@ impl Hir { info.set_match_empty(false); info.set_literal(false); info.set_alternation_literal(false); + info.static_capture_count = Some(0); Hir { kind: HirKind::Class(class), info } } @@ -318,6 +321,7 @@ impl Hir { 
if let Anchor::EndLine = anchor { info.set_line_anchored_end(true); } + info.static_capture_count = Some(0); Hir { kind: HirKind::Anchor(anchor), info } } @@ -345,6 +349,7 @@ impl Hir { if let WordBoundary::AsciiNegate = word_boundary { info.set_always_utf8(false); } + info.static_capture_count = Some(0); Hir { kind: HirKind::WordBoundary(word_boundary), info } } @@ -372,6 +377,28 @@ impl Hir { info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty()); info.set_literal(false); info.set_alternation_literal(false); + info.static_capture_count = { + use RepetitionRange::{AtLeast, Bounded, Exactly}; + match (&rep.kind, rep.hir.info.static_capture_count) { + // Always zero. + (_, Some(0)) => Some(0), + (RepetitionKind::Range(Exactly(0)), _) => Some(0), + (RepetitionKind::Range(Bounded(_, 0)), _) => Some(0), + + // Impossible to know statically. + (_, None) => None, + (RepetitionKind::ZeroOrOne, _) => None, + (RepetitionKind::ZeroOrMore, _) => None, + (RepetitionKind::Range(AtLeast(0)), _) => None, + (RepetitionKind::Range(Bounded(0, _)), _) => None, + + // Guaranteed to be static. + (RepetitionKind::OneOrMore, Some(n)) => Some(n), + (RepetitionKind::Range(Exactly(_)), Some(n)) => Some(n), + (RepetitionKind::Range(AtLeast(_)), Some(n)) => Some(n), + (RepetitionKind::Range(Bounded(_, _)), Some(n)) => Some(n), + } + }; Hir { kind: HirKind::Repetition(rep), info } } @@ -389,6 +416,12 @@ impl Hir { info.set_match_empty(group.hir.is_match_empty()); info.set_literal(false); info.set_alternation_literal(false); + info.static_capture_count = match group.kind { + GroupKind::NonCapturing { .. } => { + group.hir.info.static_capture_count + } + _ => group.hir.info.static_capture_count.map(|n| n + 1), + }; Hir { kind: HirKind::Group(group), info } } @@ -480,6 +513,10 @@ impl Hir { }) .any(|e| e.is_line_anchored_end()), ); + info.static_capture_count = + exprs.iter().fold(Some(0), |cnt, e| { + Some(cnt? + e.info.static_capture_count?) 
+ }); Hir { kind: HirKind::Concat(exprs), info } } } @@ -542,6 +579,9 @@ impl Hir { let x = info.is_alternation_literal() && e.is_literal(); info.set_alternation_literal(x); } + let mut capture_counts = exprs.iter().map(|e| e.info.static_capture_count); + let first = capture_counts.next().unwrap_or(Some(0)); + info.static_capture_count = capture_counts.fold(first, |a, b| if a == b { a } else { None }); Hir { kind: HirKind::Alternation(exprs), info } } } @@ -692,6 +732,13 @@ impl Hir { pub fn is_alternation_literal(&self) -> bool { self.info.is_alternation_literal() } + + /// Returns the number of captures groups that would participate in a + /// successful match of this expression. If this number can not be + /// statically determined from the regex this function returns `None`. + pub fn participating_captures_len(&self) -> Option { + self.info.static_capture_count.map(|c| c as usize) + } } impl HirKind { @@ -1484,6 +1531,11 @@ struct HirInfo { /// If more attributes need to be added, it is OK to increase the size of /// this as appropriate. bools: u16, + + /// How many capture groups this HIR expression deterministically fills. + /// If this number could depend on the input (e.g. an Alternation where the + /// two branches have a different number of capture groups), this is None. + static_capture_count: Option, } // A simple macro for defining bitfield accessors/mutators. @@ -1505,7 +1557,7 @@ macro_rules! 
define_bool { impl HirInfo { fn new() -> HirInfo { - HirInfo { bools: 0 } + HirInfo { bools: 0, static_capture_count: None } } define_bool!(0, is_always_utf8, set_always_utf8); diff --git a/src/compile.rs b/src/compile.rs index 90ca25015..9aa7e3d29 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -149,6 +149,7 @@ impl Compiler { self.compiled.start = dotstar_patch.entry; } self.compiled.captures = vec![None]; + self.compiled.participating_captures_len = expr.participating_captures_len(); let patch = self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); if self.compiled.needs_dotstar() { diff --git a/src/exec.rs b/src/exec.rs index e75ca083a..51cbd05a3 100644 --- a/src/exec.rs +++ b/src/exec.rs @@ -70,7 +70,7 @@ struct ExecReadOnly { /// A compiled program that is used in the NFA simulation and backtracking. /// It can be byte-based or Unicode codepoint based. /// - /// N.B. It is not possibly to make this byte-based from the public API. + /// N.B. It is not possible to make this byte-based from the public API. /// It is only used for testing byte based programs in the NFA simulations. nfa: Program, /// A compiled byte based program for DFA execution. This is only used @@ -1311,6 +1311,13 @@ impl Exec { pub fn capture_name_idx(&self) -> &Arc<HashMap<String, usize>> { &self.ro.nfa.capture_name_idx } + + /// Returns the number of participating captures that this regex will + /// return on a successful match. If this number can not be statically + /// determined from the regex this function returns `None`. + pub fn participating_captures_len(&self) -> Option<usize> { + self.ro.nfa.participating_captures_len + } } impl Clone for Exec { diff --git a/src/prog.rs b/src/prog.rs index 1775cf325..c29bd8458 100644 --- a/src/prog.rs +++ b/src/prog.rs @@ -25,6 +25,9 @@ pub struct Program { /// The ordered sequence of all capture groups extracted from the AST. /// Unnamed groups are `None`. pub captures: Vec<Option<String>>, + /// The number of capture groups that participate in a successful match. 
+ /// None if this can't be determined statically at compile time. + pub participating_captures_len: Option<usize>, /// Pointers to all named capture groups into `captures`. pub capture_name_idx: Arc<HashMap<String, usize>>, /// A pointer to the start instruction. This can vary depending on how @@ -82,6 +85,7 @@ impl Program { insts: vec![], matches: vec![], captures: vec![], + participating_captures_len: None, capture_name_idx: Arc::new(HashMap::new()), start: 0, byte_classes: vec![0; 256], diff --git a/src/re_bytes.rs b/src/re_bytes.rs index d71969257..4983f2f88 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -666,6 +666,13 @@ impl Regex { pub fn captures_len(&self) -> usize { self.0.capture_names().len() } + + /// Returns the number of participating captures that this regex will + /// return on a successful match. If this number can not be statically + /// determined from the regex this function returns `None`. + pub fn participating_captures_len(&self) -> Option<usize> { + self.0.participating_captures_len() + } /// Returns an empty set of capture locations that can be reused in /// multiple calls to `captures_read` or `captures_read_at`. 
diff --git a/tests/api.rs b/tests/api.rs index c7250a8a3..6cd9a4b22 100644 --- a/tests/api.rs +++ b/tests/api.rs @@ -128,6 +128,44 @@ fn capture_index_lifetime() { assert_eq!(3, inner("123")); } +#[test] +fn participating_captures_len() { + let tests = [ + ("", Some(0)), + ("foo|bar", Some(0)), + ("(foo)|bar", None), + ("foo|(bar)", None), + ("(foo|bar)", Some(1)), + ("(a|b|c|d|e|f)", Some(1)), + ("(a)|(b)|(c)|(d)|(e)|(f)", Some(1)), + ("(a)(b)|(c)(d)|(e)(f)", Some(2)), + ("(a)(b)(c)|(d)(e)(f)", Some(3)), + ("(a)(b)(c)(d)(e)(f)", Some(6)), + ("(a)(b)(extra)|(a)(b)()", Some(3)), + ("(a)(b)((?:extra)?)", Some(3)), + ("(a)(b)(extra)?", None), + ("(foo)|(bar)", Some(1)), + ("(foo)(bar)", Some(2)), + ("(foo)+(bar)", Some(2)), + ("(foo)*(bar)", None), + ("(foo)?{0}", Some(0)), + ("(foo)?{1}", None), + ("(foo){1}", Some(1)), + ("(foo){1,}", Some(1)), + ("(foo){1,}?", Some(1)), + ("(foo){0,}", None), + ("(foo)(?:bar)", Some(1)), + ("(foo(?:bar)+)(?:baz(boo))", Some(2)), + ("(?Pfoo)(?:bar)(bal|loon)", Some(2)), + (r"(?:(\w)(\s))?", None), + (r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#, Some(2)), + ]; + for (test_regex, expected) in tests { + let re = regex!(test_regex); + assert_eq!(re.participating_captures_len(), expected, "for regex {test_regex}"); + } +} + #[test] fn capture_misc() { let re = regex!(r"(.)(?Pa)?(.)(?P.)"); From 559606c3af200c7e2236eb055b0c063cf712a2a1 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Sat, 24 Sep 2022 17:57:51 +0200 Subject: [PATCH 3/3] Removed unnecessary unicode-reliant test and ran fmt. 
--- regex-syntax/src/hir/mod.rs | 8 +++++--- src/compile.rs | 3 ++- src/re_bytes.rs | 2 +- src/re_unicode.rs | 2 +- tests/api.rs | 7 +++++-- 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 8c5e4ba9d..a4b7bc4ac 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -579,9 +579,11 @@ impl Hir { let x = info.is_alternation_literal() && e.is_literal(); info.set_alternation_literal(x); } - let mut capture_counts = exprs.iter().map(|e| e.info.static_capture_count); + let mut capture_counts = + exprs.iter().map(|e| e.info.static_capture_count); let first = capture_counts.next().unwrap_or(Some(0)); - info.static_capture_count = capture_counts.fold(first, |a, b| if a == b { a } else { None }); + info.static_capture_count = capture_counts + .fold(first, |a, b| if a == b { a } else { None }); Hir { kind: HirKind::Alternation(exprs), info } } } @@ -732,7 +734,7 @@ impl Hir { pub fn is_alternation_literal(&self) -> bool { self.info.is_alternation_literal() } - + /// Returns the number of captures groups that would participate in a /// successful match of this expression. If this number can not be /// statically determined from the regex this function returns `None`. 
diff --git a/src/compile.rs b/src/compile.rs index 9aa7e3d29..c7bc66c0b 100644 --- a/src/compile.rs +++ b/src/compile.rs @@ -149,7 +149,8 @@ impl Compiler { self.compiled.start = dotstar_patch.entry; } self.compiled.captures = vec![None]; - self.compiled.participating_captures_len = expr.participating_captures_len(); + self.compiled.participating_captures_len = + expr.participating_captures_len(); let patch = self.c_capture(0, expr)?.unwrap_or_else(|| self.next_inst()); if self.compiled.needs_dotstar() { diff --git a/src/re_bytes.rs b/src/re_bytes.rs index 4983f2f88..cd1594773 100644 --- a/src/re_bytes.rs +++ b/src/re_bytes.rs @@ -666,7 +666,7 @@ impl Regex { pub fn captures_len(&self) -> usize { self.0.capture_names().len() } - + /// Returns the number of participating captures that this regex will /// return on a successful match. If this number can not be statically /// determined from the regex this function returns `None`. diff --git a/src/re_unicode.rs b/src/re_unicode.rs index b6e1f7b3e..29ac6b218 100644 --- a/src/re_unicode.rs +++ b/src/re_unicode.rs @@ -724,7 +724,7 @@ impl Regex { pub fn captures_len(&self) -> usize { self.0.capture_names().len() } - + /// Returns the number of participating captures that this regex will /// return on a successful match. If this number can not be statically /// determined from the regex this function returns `None`. diff --git a/tests/api.rs b/tests/api.rs index 6cd9a4b22..4dc20c4b9 100644 --- a/tests/api.rs +++ b/tests/api.rs @@ -157,12 +157,15 @@ fn participating_captures_len() { ("(foo)(?:bar)", Some(1)), ("(foo(?:bar)+)(?:baz(boo))", Some(2)), ("(?Pfoo)(?:bar)(bal|loon)", Some(2)), - (r"(?:(\w)(\s))?", None), (r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#, Some(2)), ]; for (test_regex, expected) in tests { let re = regex!(test_regex); - assert_eq!(re.participating_captures_len(), expected, "for regex {test_regex}"); + assert_eq!( + re.participating_captures_len(), + expected, + "for regex {test_regex}" + ); } }