Skip to content

Commit 1592687

Browse files
Move character set parsing logic into the parser
1 parent 7e02b96 commit 1592687

File tree

6 files changed

+133
-39
lines changed

6 files changed

+133
-39
lines changed

assembly/char.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@ export const enum Char {
99
LeftParenthesis = 0x28,
1010
RightParenthesis = 0x29,
1111
Asterisk = 0x2a, // "*"
12-
Comma = 0x2c, // ","
1312
Plus = 0x2b, // "+"
13+
Comma = 0x2c, // ","
14+
Minus = 0x2d, // "-"
1415
Dot = 0x2e, // "."
1516
Zero = 0x30,
1617
Question = 0x3f, // "?"

assembly/nfa/matcher.ts

Lines changed: 46 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,42 @@ import {
44
CharacterNode,
55
CharacterSetNode,
66
CharacterClassNode,
7+
CharacterRangeNode,
78
} from "../parser/node";
9+
import { Match } from "../regexp";
810

9-
export abstract class Matcher {
10-
abstract matches(code: u32): bool;
11+
export class Matcher {
12+
matches(code: u32): bool {
13+
return false;
14+
}
1115

1216
static fromCharacterClassNode(
1317
node: CharacterClassNode
1418
): CharacterClassMatcher {
1519
return new CharacterClassMatcher(node.charClass);
1620
}
1721

22+
static fromCharacterRangeNode(
23+
node: CharacterRangeNode
24+
): CharacterRangeMatcher {
25+
return new CharacterRangeMatcher(node.from, node.to);
26+
}
27+
1828
static fromCharacterSetNode(node: CharacterSetNode): CharacterSetMatcher {
19-
return new CharacterSetMatcher(node.chars, node.negated);
29+
const matchers = new Array<Matcher>();
30+
for (let i = 0; i < node.expressions.length; i++) {
31+
const exp = node.expressions[i];
32+
if (CharacterRangeNode.is(exp)) {
33+
matchers.push(
34+
Matcher.fromCharacterRangeNode(exp as CharacterRangeNode)
35+
);
36+
} else if (CharacterNode.is(exp)) {
37+
matchers.push(Matcher.fromCharacterNode(exp as CharacterNode));
38+
} else {
39+
throw new Error("unsupported node type within character set");
40+
}
41+
}
42+
return new CharacterSetMatcher(matchers, node.negated);
2043
}
2144

2245
static fromCharacterNode(node: CharacterNode): CharacterMatcher {
@@ -34,6 +57,16 @@ export class CharacterMatcher extends Matcher {
3457
}
3558
}
3659

60+
export class CharacterRangeMatcher extends Matcher {
61+
constructor(public from: u32, public to: u32) {
62+
super();
63+
}
64+
65+
matches(code: u32): bool {
66+
return code >= this.from && code <= this.to;
67+
}
68+
}
69+
3770
export class CharacterClassMatcher extends Matcher {
3871
constructor(public charClass: Char) {
3972
super();
@@ -79,28 +112,20 @@ export class CharacterClassMatcher extends Matcher {
79112
}
80113
}
81114

115+
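// no closure support: callbacks cannot capture local variables, so the code being matched is shared with them through this module-level variable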
116+
let _code: u32;
117+
82118
export class CharacterSetMatcher extends Matcher {
83-
constructor(public set: string, public negated: bool) {
119+
constructor(public matchers: Matcher[], public negated: bool) {
84120
super();
85121
}
86122

87-
matchesSet(set: string, code: u32): bool {
88-
for (let i = 0, len = set.length; i < len; i++) {
89-
// TODO - perform the set parsing logic in the constructor?
90-
// TODO - move into the parser?
91-
if (i < len - 2 && set.charCodeAt(i + 1) == 45 /*-*/) {
92-
const from = set.charCodeAt(i) as u32;
93-
const to = set.charCodeAt(i + 2) as u32;
94-
if (code >= from && code <= to) return true;
95-
} else {
96-
if (set.charCodeAt(i) == code) return true;
97-
}
98-
}
99-
return false;
100-
}
101-
102123
matches(code: u32): bool {
103-
const matches = this.matchesSet(this.set, code);
104-
return this.negated ? !matches : matches;
124+
_code = code;
125+
if (!this.negated) {
126+
return this.matchers.some((m) => m.matches(_code));
127+
} else {
128+
return !this.matchers.some((m) => m.matches(_code));
129+
}
105130
}
106131
}

assembly/parser/node.ts

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ export const enum NodeType {
99
Character,
1010
CharacterSet,
1111
CharacterClass,
12+
CharacterRange,
1213
Repetition,
1314
RangeRepetition,
1415
Group,
@@ -79,7 +80,7 @@ export class ConcatenationNode extends Node {
7980
}
8081

8182
export class CharacterSetNode extends Node {
82-
constructor(public chars: string, public negated: bool) {
83+
constructor(public expressions: Node[], public negated: bool) {
8384
super(NodeType.CharacterSet);
8485
}
8586

@@ -88,7 +89,24 @@ export class CharacterSetNode extends Node {
8889
}
8990

9091
clone(): Node {
91-
return new CharacterSetNode(this.chars, this.negated);
92+
return new CharacterSetNode(
93+
this.expressions.slice(0).map<Node>((s) => s.clone()),
94+
this.negated
95+
);
96+
}
97+
}
98+
99+
export class CharacterRangeNode extends Node {
100+
constructor(public from: u32, public to: u32) {
101+
super(NodeType.CharacterRange);
102+
}
103+
104+
static is(node: Node): bool {
105+
return node.type == NodeType.CharacterRange;
106+
}
107+
108+
clone(): Node {
109+
return new CharacterRangeNode(this.from, this.to);
92110
}
93111
}
94112

assembly/parser/parser.ts

Lines changed: 28 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import {
1111
ConcatenationNode,
1212
RepetitionNode,
1313
CharacterSetNode,
14+
CharacterRangeNode,
1415
} from "./node";
1516

1617
function isQuantifier(code: Char): bool {
@@ -117,7 +118,7 @@ export class Parser {
117118
let firstDigit = true;
118119
let digitStr = "";
119120
while (this.more()) {
120-
let token = this.currentToken.charCodeAt(0);
121+
const token = this.currentToken.charCodeAt(0);
121122
if (token == Char.RightParenthesis) break;
122123
if (firstDigit) {
123124
if (isDigit(token)) {
@@ -170,7 +171,7 @@ export class Parser {
170171
private parseSequence(): Node {
171172
let nodes = new Array<Node>();
172173
while (this.more()) {
173-
let token = this.currentToken.charCodeAt(0);
174+
const token = this.currentToken.charCodeAt(0);
174175
if (token == Char.RightParenthesis) break;
175176
// @ts-ignore
176177
if (token == Char.VerticalBar) {
@@ -207,23 +208,38 @@ export class Parser {
207208
return nodes.length > 1 ? new ConcatenationNode(nodes) : nodes[0];
208209
}
209210

211+
private parseCharacterRange(): Node {
212+
const from = this.eatToken();
213+
this.eatToken(Char.Minus);
214+
const to = this.eatToken();
215+
return new CharacterRangeNode(from, to);
216+
}
217+
210218
private parseCharacterSet(): CharacterSetNode {
211-
let chars = "";
212219
this.eatToken(Char.LeftSquareBracket);
213-
const negated = this.currentToken == "^";
220+
const token = this.currentToken.charCodeAt(0);
221+
222+
const negated = token == Char.Caret;
214223
if (negated) {
215224
this.eatToken(Char.Caret);
216225
}
217-
while (
218-
this.currentToken != "]" ||
219-
(chars.length == 0 && this.currentToken == "]")
220-
) {
221-
// TODO characters set can contain character classes
222-
chars += this.currentToken;
223-
this.eatToken();
226+
227+
const nodes = new Array<Node>();
228+
while (this.currentToken != "]" || nodes.length == 0) {
229+
// lookahead for character range
230+
if (
231+
this.cursor + 1 < u32(this.input.length) &&
232+
this.input.charCodeAt(this.cursor + 1) == Char.Minus &&
233+
this.input.charCodeAt(this.cursor + 2) != Char.RightSquareBracket
234+
) {
235+
nodes.push(this.parseCharacterRange());
236+
} else {
237+
nodes.push(this.parseCharacter());
238+
}
239+
224240
// TODO: raise an error if the input ends before the closing "]" is found?
225241
}
226242
this.eatToken(Char.RightSquareBracket);
227-
return new CharacterSetNode(chars, negated);
243+
return new CharacterSetNode(nodes, negated);
228244
}
229245
}

assembly/regexp.ts

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,17 @@
1-
import { State, Automata, toNFAFromAST, GroupEndMarkerState } from "./nfa/nfa";
1+
import {
2+
State,
3+
Automata,
4+
toNFAFromAST,
5+
GroupEndMarkerState,
6+
MatcherState,
7+
} from "./nfa/nfa";
28
import { walker as nfaWalker } from "./nfa/walker";
39
import { ConcatenationNode, AssertionNode } from "./parser/node";
410
import { Char } from "./char";
511
import { Parser } from "./parser/parser";
612
import { first, last } from "./util";
713
import { walker as astWalker, expandRepetitions } from "./parser/walker";
14+
import { CharacterMatcher, CharacterSetMatcher, Matcher } from "./nfa/matcher";
815

916
function recursiveBacktrackingSearch(
1017
state: State,
@@ -145,5 +152,32 @@ export class RegExp {
145152
// TODO: do we need this factory function, or can we invoke
146153
// the constructor via the loader?
147154
export function createRegExp(regex: string, flags: string): RegExp {
155+
156+
/* ---------------- */
157+
/*
158+
This block of code is needed to avoid the following runtime error ...
159+
160+
RuntimeError: unreachable
161+
at assembly/nfa/matcher/Matcher#matches@virtual (wasm-function[240]:1)
162+
at assembly/nfa/matcher/CharacterSetMatcher#matches~anonymous|0 (wasm-function[241]:19)
163+
at ~lib/array/Array<assembly/nfa/matcher/Matcher>#some (wasm-function[242]:85)
164+
at assembly/nfa/matcher/CharacterSetMatcher#matches (wasm-function[244]:21)
165+
at assembly/nfa/nfa/MatcherState<assembly/nfa/matcher/CharacterSetMatcher>#matches (wasm-function[245]:8)
166+
at assembly/nfa/nfa/State#matches@virtual (wasm-function[250]:58)
167+
at assembly/regexp/recursiveBacktrackingSearch (wasm-function[184]:121)
168+
at assembly/regexp/recursiveBacktrackingSearch@varargs (wasm-function[185]:56)
169+
at assembly/regexp/RegExp#exec (wasm-function[192]:307)
170+
*/
171+
const matchers = new Array<Matcher>();
172+
matchers.push(new CharacterMatcher(Char.A));
173+
const charMatcher = new CharacterSetMatcher(matchers, false);
174+
const state = new MatcherState<CharacterSetMatcher>(
175+
charMatcher,
176+
new State(true)
177+
);
178+
const char = "a".charCodeAt(0);
179+
const doesMatch = state.matches(char) != null;
180+
/* ---------------- */
181+
148182
return new RegExp(regex, flags);
149183
}

ts/index.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ globalAny.log = console.log;
55

66
import { RegExp } from "../assembly/regexp";
77

8-
const regexObj = new RegExp("(a*)+");
9-
const match = regexObj.exec("-");
8+
const regexObj = new RegExp("[]a]");
9+
const match = regexObj.exec("]");
1010

1111
console.log(match);

0 commit comments

Comments
 (0)