Skip to content

Commit 126f9ef

Browse files
committed
Add multiaddr expression group matching
Support captures export some things wip thinking about public API Think about exposing meg as a public API doc comments Finish rename Add helper for meg and add test add comment for devs
1 parent 3ca4833 commit 126f9ef

File tree

6 files changed

+545
-0
lines changed

6 files changed

+545
-0
lines changed

component.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,10 @@ func (c Component) Protocol() Protocol {
121121
return c.protocol
122122
}
123123

124+
func (c Component) Code() int {
125+
return c.Protocol().Code
126+
}
127+
124128
// RawValue returns the portion of c.bytes from c.offset onward, converted to
// a []byte.
func (c Component) RawValue() []byte {
	tail := c.bytes[c.offset:]
	return []byte(tail)
}

meg/meg.go

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
// Package meg implements regular expressions for multiaddr Components. The name is short for "Megular Expressions".
2+
package meg
3+
4+
// The developer is assumed to be familiar with the Thompson NFA approach to
5+
// regex before making changes to this file. Refer to
6+
// https://swtch.com/~rsc/regexp/regexp1.html for an introduction.
7+
8+
import (
9+
"fmt"
10+
"slices"
11+
)
12+
13+
// stateKind identifies the role a MatchState plays inside the NFA.
type stateKind uint8

const (
	// matchCode states consume one component whose code equals the state's code.
	matchCode stateKind = iota
	// split states consume nothing and fork to both next and nextSplit.
	split
	// done is the accepting state.
	done
)

// MatchState is the Thompson NFA for a regular expression.
//
// Match writes to the generation field of the states it visits without any
// synchronization, so a given pattern must not be matched from several
// goroutines at the same time.
type MatchState struct {
	// capture, when non-nil on a matchCode state, is recorded together with
	// the value of every component that state consumes.
	capture captureFunc
	// next is the successor state; for a split state it is the first branch.
	next *MatchState
	// nextSplit is the second branch of a split state.
	nextSplit *MatchState

	kind stateKind
	// generation is the id of the last state list this state was added to.
	// It is how appendState avoids adding a state to the same list twice.
	generation int
	// code is the component code a matchCode state accepts.
	code int
}

// captureFunc is a pointer to a capture callback. A pointer is used because
// func values are not comparable and therefore cannot be map keys.
type captureFunc *func(string) error

// captureMap collects, per capture callback, the values captured so far along
// one path through the NFA.
type captureMap map[captureFunc][]string

// clone returns a copy of cm whose value slices do not share storage with
// the originals, so two NFA branches can keep capturing independently.
func (cm captureMap) clone() captureMap {
	out := make(captureMap, len(cm))
	for k, v := range cm {
		out[k] = slices.Clone(v)
	}
	return out
}

// statesAndCaptures is a list of live NFA states. captures[i] holds the
// values captured on the path that reached states[i].
type statesAndCaptures struct {
	states   []*MatchState
	captures []captureMap
}

// String renders the kind, generation and code of s for debugging output.
func (s *MatchState) String() string {
	return fmt.Sprintf("state{kind: %d, generation: %d, code: %d}", s.kind, s.generation, s.code)
}

// Matchable is a single element of the sequence being matched.
type Matchable interface {
	// Code is compared against the code of matchCode states.
	Code() int
	Value() string // Used when capturing the value
}

// Match returns whether the given Components match the Pattern defined in MatchState.
// Errors are used to communicate capture errors.
// If the error is non-nil the returned bool will be false.
//
// Capture callbacks only run once a complete match has been found, using the
// values recorded along the accepted path. Values for a single callback are
// delivered in input order; the order in which different callbacks run is
// unspecified because they are stored in a map.
func Match[S ~[]T, T Matchable](s *MatchState, components S) (bool, error) {
	listGeneration := s.generation + 1               // Start at the last generation + 1
	defer func() { s.generation = listGeneration }() // In case we reuse this state, store our highest generation number

	currentStates := statesAndCaptures{
		states:   make([]*MatchState, 0, 16),
		captures: make([]captureMap, 0, 16),
	}
	nextStates := statesAndCaptures{
		states:   make([]*MatchState, 0, 16),
		captures: make([]captureMap, 0, 16),
	}

	currentStates = appendState(currentStates, s, nil, listGeneration)

	for _, c := range components {
		if len(currentStates.states) == 0 {
			return false, nil
		}
		// Every state list needs its own generation. The start list above
		// already used the initial value, so bump it before building the
		// next list; otherwise states that are in the start list could never
		// be re-entered after the first component (e.g. `a?a` against "aa").
		listGeneration++
		for i, state := range currentStates.states {
			if state.kind != matchCode || state.code != c.Code() {
				continue
			}
			cm := currentStates.captures[i]
			if state.capture != nil {
				cm[state.capture] = append(cm[state.capture], c.Value())
			}
			nextStates = appendState(nextStates, state.next, cm, listGeneration)
		}
		// Swap the lists and recycle the old one's backing storage.
		currentStates, nextStates = nextStates, currentStates
		nextStates.states = nextStates.states[:0]
		nextStates.captures = nextStates.captures[:0]
	}

	for i, state := range currentStates.states {
		if state.kind != done {
			continue
		}
		// We found a complete path. Run the captures now
		for f, values := range currentStates.captures[i] {
			for _, v := range values {
				if err := (*f)(v); err != nil {
					return false, err
				}
			}
		}
		return true, nil
	}
	return false, nil
}

// appendState adds s to arr unless s is nil or was already added to the list
// identified by listGeneration. split states are never stored themselves;
// both of their branches are followed instead, the second one with its own
// copy of the captures so the branches cannot affect each other. A nil c is
// replaced by a fresh empty captureMap.
func appendState(arr statesAndCaptures, s *MatchState, c captureMap, listGeneration int) statesAndCaptures {
	if s == nil || s.generation == listGeneration {
		return arr
	}
	if c == nil {
		c = make(captureMap)
	}
	s.generation = listGeneration
	if s.kind == split {
		arr = appendState(arr, s.next, c, listGeneration)
		arr = appendState(arr, s.nextSplit, c.clone(), listGeneration)
	} else {
		arr.states = append(arr.states, s)
		arr.captures = append(arr.captures, c)
	}
	return arr
}

meg/meg_test.go

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
package meg
2+
3+
import (
4+
"regexp"
5+
"slices"
6+
"testing"
7+
"testing/quick"
8+
)
9+
10+
type codeAndValue struct {
11+
code int
12+
val string // Uses the string type to ensure immutability.
13+
}
14+
15+
// Code implements Matchable.
16+
func (c codeAndValue) Code() int {
17+
return c.code
18+
}
19+
20+
// Value implements Matchable.
21+
func (c codeAndValue) Value() string {
22+
return c.val
23+
}
24+
25+
var _ Matchable = codeAndValue{}
26+
27+
func TestSimple(t *testing.T) {
28+
type testCase struct {
29+
pattern *MatchState
30+
skipQuickCheck bool
31+
shouldMatch [][]int
32+
shouldNotMatch [][]int
33+
}
34+
testCases :=
35+
[]testCase{
36+
{
37+
pattern: PatternToMatchState(Val(0), Val(1)),
38+
shouldMatch: [][]int{{0, 1}},
39+
shouldNotMatch: [][]int{
40+
{0},
41+
{0, 0},
42+
{0, 1, 0},
43+
}}, {
44+
pattern: PatternToMatchState(Val(0), Val(1), Optional(Val(2))),
45+
shouldMatch: [][]int{
46+
{0, 1, 2},
47+
{0, 1},
48+
},
49+
shouldNotMatch: [][]int{
50+
{0},
51+
{0, 0},
52+
{0, 1, 0},
53+
{0, 1, 2, 0},
54+
}}, {
55+
pattern: PatternToMatchState(Val(0), Val(1), OneOrMore(2)),
56+
skipQuickCheck: true,
57+
shouldMatch: [][]int{
58+
{0, 1, 2, 2, 2, 2},
59+
{0, 1, 2},
60+
},
61+
shouldNotMatch: [][]int{
62+
{0},
63+
{0, 0},
64+
{0, 1},
65+
{0, 1, 0},
66+
{0, 1, 1, 0},
67+
{0, 1, 2, 0},
68+
}},
69+
}
70+
71+
for i, tc := range testCases {
72+
for _, m := range tc.shouldMatch {
73+
if matches, _ := Match(tc.pattern, codesToCodeAndValue(m)); !matches {
74+
t.Fatalf("failed to match %v with %s. idx=%d", m, tc.pattern, i)
75+
}
76+
}
77+
for _, m := range tc.shouldNotMatch {
78+
if matches, _ := Match(tc.pattern, codesToCodeAndValue(m)); matches {
79+
t.Fatalf("failed to not match %v with %s. idx=%d", m, tc.pattern, i)
80+
}
81+
}
82+
if tc.skipQuickCheck {
83+
continue
84+
}
85+
if err := quick.Check(func(notMatch []int) bool {
86+
for _, shouldMatch := range tc.shouldMatch {
87+
if slices.Equal(notMatch, shouldMatch) {
88+
// The random `notMatch` is actually something that shouldMatch. Skip it.
89+
return true
90+
}
91+
}
92+
matches, _ := Match(tc.pattern, codesToCodeAndValue(notMatch))
93+
return !matches
94+
}, &quick.Config{}); err != nil {
95+
t.Fatal(err)
96+
}
97+
}
98+
}
99+
100+
func TestCapture(t *testing.T) {
101+
type setupStateAndAssert func() (*MatchState, func())
102+
type testCase struct {
103+
setup setupStateAndAssert
104+
parts []codeAndValue
105+
}
106+
107+
testCases :=
108+
[]testCase{
109+
{
110+
setup: func() (*MatchState, func()) {
111+
var code0str string
112+
return PatternToMatchState(CaptureVal(0, &code0str), Val(1)), func() {
113+
if code0str != "hello" {
114+
panic("unexpected value")
115+
}
116+
}
117+
},
118+
parts: []codeAndValue{{0, "hello"}, {1, "world"}},
119+
},
120+
{
121+
setup: func() (*MatchState, func()) {
122+
var code0strs []string
123+
return PatternToMatchState(CaptureOneOrMore(0, &code0strs), Val(1)), func() {
124+
if code0strs[0] != "hello" {
125+
panic("unexpected value")
126+
}
127+
if code0strs[1] != "world" {
128+
panic("unexpected value")
129+
}
130+
}
131+
},
132+
parts: []codeAndValue{{0, "hello"}, {0, "world"}, {1, ""}},
133+
},
134+
}
135+
136+
_ = testCases
137+
for _, tc := range testCases {
138+
state, assert := tc.setup()
139+
if matches, _ := Match(state, tc.parts); !matches {
140+
t.Fatalf("failed to match %v with %s", tc.parts, state)
141+
}
142+
assert()
143+
}
144+
}
145+
146+
func codesToCodeAndValue(codes []int) []codeAndValue {
147+
out := make([]codeAndValue, len(codes))
148+
for i, c := range codes {
149+
out[i] = codeAndValue{code: c}
150+
}
151+
return out
152+
}
153+
154+
func bytesToCodeAndValue(codes []byte) []codeAndValue {
155+
out := make([]codeAndValue, len(codes))
156+
for i, c := range codes {
157+
out[i] = codeAndValue{code: int(c)}
158+
}
159+
return out
160+
}
161+
162+
// FuzzMatchesRegexpBehavior fuzz tests the expression matcher by comparing it to the behavior of the regexp package.
163+
func FuzzMatchesRegexpBehavior(f *testing.F) {
164+
bytesToRegexpAndPattern := func(exp []byte) ([]byte, []Pattern) {
165+
if len(exp) < 3 {
166+
panic("regexp too short")
167+
}
168+
pattern := make([]Pattern, 0, len(exp)-2)
169+
for i, b := range exp {
170+
b = b % 32
171+
if i == 0 {
172+
exp[i] = '^'
173+
continue
174+
} else if i == len(exp)-1 {
175+
exp[i] = '$'
176+
continue
177+
}
178+
switch {
179+
case b < 26:
180+
exp[i] = b + 'a'
181+
pattern = append(pattern, Val(int(exp[i])))
182+
case i > 1 && b == 26:
183+
exp[i] = '?'
184+
pattern = pattern[:len(pattern)-1]
185+
pattern = append(pattern, Optional(Val(int(exp[i-1]))))
186+
case i > 1 && b == 27:
187+
exp[i] = '*'
188+
pattern = pattern[:len(pattern)-1]
189+
pattern = append(pattern, ZeroOrMore(int(exp[i-1])))
190+
case i > 1 && b == 28:
191+
exp[i] = '+'
192+
pattern = pattern[:len(pattern)-1]
193+
pattern = append(pattern, OneOrMore(int(exp[i-1])))
194+
default:
195+
exp[i] = 'a'
196+
pattern = append(pattern, Val(int(exp[i])))
197+
}
198+
}
199+
200+
return exp, pattern
201+
}
202+
203+
simplifyB := func(buf []byte) []byte {
204+
for i, b := range buf {
205+
buf[i] = (b % 26) + 'a'
206+
}
207+
return buf
208+
}
209+
210+
f.Fuzz(func(t *testing.T, expRules []byte, corpus []byte) {
211+
if len(expRules) < 3 || len(expRules) > 1024 || len(corpus) > 1024 {
212+
return
213+
}
214+
corpus = simplifyB(corpus)
215+
regexpPattern, pattern := bytesToRegexpAndPattern(expRules)
216+
matched, err := regexp.Match(string(regexpPattern), corpus)
217+
if err != nil {
218+
// Malformed regex. Ignore
219+
return
220+
}
221+
p := PatternToMatchState(pattern...)
222+
otherMatched, _ := Match(p, bytesToCodeAndValue(corpus))
223+
if otherMatched != matched {
224+
t.Log("regexp", string(regexpPattern))
225+
t.Log("corpus", string(corpus))
226+
m2, err2 := regexp.Match(string(regexpPattern), corpus)
227+
t.Logf("regexp matched %v. %v. %v, %v. \n%v - \n%v", matched, err, m2, err2, regexpPattern, corpus)
228+
t.Logf("pattern %+v", pattern)
229+
t.Fatalf("mismatched results: %v %v %v", otherMatched, matched, p)
230+
}
231+
})
232+
233+
}

0 commit comments

Comments
 (0)