Commit 34ed94d

Lexer: Properly support Unicode 15.1.0
The previous lexer implementation in `Language.Rust.Parser.Lexer` was broken for Unicode characters with sufficiently large codepoints, as it incorrectly attempted to port UTF-16–encoded codepoints over to `alex`, which expects UTF-8–encoded input. Rather than try to fix the previous implementation (which was based on old `rustc` code that is no longer used), this ports the lexer to a new implementation based on the Rust `unicode-xid` crate (which is how modern versions of `rustc` lex Unicode characters). Specifically:

* This adapts `unicode-xid`'s lexer generation script to generate an `alex`-based lexer instead of a Rust-based one.
* The new lexer is generated to support codepoints from Unicode 15.1.0. (It is unclear exactly which Unicode version the previous lexer targeted, but given that it was last updated in 2016, it was likely quite old.)
* I have verified that the new lexer can lex exotic Unicode characters such as `𝑂` and `𐌝` by adding them as regression tests.

Fixes #3.
1 parent dfcbae3 commit 34ed94d
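As a quick illustration of why `𝑂` and `𐌝` make good regression tests (a sketch for this page, not part of the commit): both characters lie outside the Basic Multilingual Plane, so their UTF-16 encodings require surrogate pairs, yet both carry the `XID_Start` property that identifier lexing relies on. Python's identifier syntax (PEP 3131) is defined in terms of the same `XID_Start`/`XID_Continue` properties, which makes for a convenient cross-check:

    for ch in ["𝑂", "𐌝"]:
        # Both codepoints are above U+FFFF and both are valid identifier starts.
        print(ord(ch) > 0xFFFF, ch.isidentifier())   # prints "True True" twice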

File tree

4 files changed (+1196, -411 lines)


.gitignore

Lines changed: 4 additions & 0 deletions
@@ -26,3 +26,7 @@ sample-sources/
 !sample-sources/statement-expressions.rs
 !sample-sources/statements.rs
 !sample-sources/types.rs
+
+# Unicode-related autogenerated files
+DerivedCoreProperties.txt
+UnicodeLexer.x

scripts/unicode.py

Lines changed: 167 additions & 0 deletions
@@ -0,0 +1,167 @@
#!/usr/bin/env python
#
# Copyright 2011-2015 The Rust Project Developers
#           2024 Galois Inc.
#
# This script was originally created by the Rust Project Developers as part of
# the `unicode-xid` crate:
#
# https://github.com/unicode-rs/unicode-xid/blob/b3a2718b062da229c0a50d12281de0e5d8e8cff6/scripts/unicode.py
#
# See the COPYRIGHT file in the `unicode-xid` crate:
#
# https://github.com/unicode-rs/unicode-xid/blob/b3a2718b062da229c0a50d12281de0e5d8e8cff6/COPYRIGHT
#
# Galois Inc. has modified the script to generate an `alex`-based lexer instead
# of a Rust-based lexer.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

import fileinput, re, os, sys

unicode_version = (15, 1, 0)

preamble = '''-- NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
--
-- If you need to update this code, perform the following steps:
--
-- 1. (Optional) Update the value of `unicode_version` in "scripts/unicode.py".
-- 2. Run the "scripts/unicode.py" script.
-- 3. Copy the code (including the comments) in the autogenerated `UnicodeLexer.x` file.
-- 4. Replace the existing autogenerated code here.
'''

postamble = '''-- End of code generated by "scripts/unicode.py".
'''

def unicode_url(f):
    return "http://www.unicode.org/Public/%s.%s.%s/ucd/%s" % (unicode_version + (f,))

def fetch(f):
    if not os.path.exists(os.path.basename(f)):
        os.system("curl -O %s" % unicode_url(f))

    if not os.path.exists(os.path.basename(f)):
        sys.stderr.write("cannot load %s" % f)
        exit(1)

def group_cat(cat):
    cat_out = []
    letters = sorted(set(cat))
    cur_start = letters.pop(0)
    cur_end = cur_start
    for letter in letters:
        assert letter > cur_end, \
            "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
        if letter == cur_end + 1:
            cur_end = letter
        else:
            cat_out.append((cur_start, cur_end))
            cur_start = cur_end = letter
    cat_out.append((cur_start, cur_end))
    return cat_out

def ungroup_cat(cat):
    cat_out = []
    for (lo, hi) in cat:
        while lo <= hi:
            cat_out.append(lo)
            lo += 1
    return cat_out
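# Illustrative examples of the two helpers above (not from the upstream script):
#
#   group_cat([0x41, 0x42, 0x43, 0x5a]) == [(0x41, 0x43), (0x5a, 0x5a)]
#   ungroup_cat([(0x61, 0x63)])         == [0x61, 0x62, 0x63]
#
# Round-tripping through ungroup_cat and then group_cat (as load_properties
# does below) merges any ranges that happen to be adjacent in the raw data.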
def format_table_content(f, content, indent):
    line = ""
    first = True
    for chunk in content.split("|"):
        line += " " * indent
        if first:
            line += "= " + chunk
        else:
            line += "| " + chunk
        line += "\n"
        first = False
    f.write(line + '\n')

def load_properties(f, interestingprops):
    fetch(f)
    props = {}
    re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")

    for line in fileinput.input(os.path.basename(f)):
        prop = None
        d_lo = 0
        d_hi = 0
        m = re1.match(line)
        if m:
            d_lo = m.group(1)
            d_hi = m.group(1)
            prop = m.group(2)
        else:
            m = re2.match(line)
            if m:
                d_lo = m.group(1)
                d_hi = m.group(2)
                prop = m.group(3)
            else:
                continue
        if interestingprops and prop not in interestingprops:
            continue
        d_lo = int(d_lo, 16)
        d_hi = int(d_hi, 16)
        if prop not in props:
            props[prop] = []
        props[prop].append((d_lo, d_hi))

    # optimize if possible
    for prop in props:
        props[prop] = group_cat(ungroup_cat(props[prop]))

    return props
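# The DerivedCoreProperties.txt entries matched above are single codepoints or
# codepoint ranges followed by a property name, roughly like this (trailing
# comments elided):
#
#   0041..005A    ; XID_Start
#   00AA          ; XID_Start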
def escape_char(c):
    return "\\x%04x" % c

def emit_table(f, name, t_data):
    f.write("@%s\n" % name)
    data = ""
    first = True
    for dat in t_data:
        if not first:
            data += "|"
        first = False
        if dat[0] == dat[1]:
            data += "%s" % escape_char(dat[0])
        else:
            data += "[%s-%s]" % (escape_char(dat[0]), escape_char(dat[1]))
    format_table_content(f, data, 2)

def emit_property_module(f, mod, tbl, emit):
    for cat in emit:
        emit_table(f, cat, tbl[cat])

if __name__ == "__main__":
    r = "UnicodeLexer.x"
    if os.path.exists(r):
        os.remove(r)
    with open(r, "w") as rf:
        # write the file's preamble
        rf.write(preamble)

        # download and parse all the data
        rf.write('''
-- Based on Unicode %s.%s.%s, using the following Unicode table:
-- %s

''' % (unicode_version + (unicode_url("DerivedCoreProperties.txt"),)))

        want_derived = ["XID_Start", "XID_Continue"]
        derived = load_properties("DerivedCoreProperties.txt", want_derived)
        emit_property_module(rf, "derived_property", derived, want_derived)

        # write the file's postamble
        rf.write(postamble)
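For a sense of the output, here is a small sketch (not part of the commit; it assumes the definitions above are in scope) that feeds `emit_table` a two-entry table of genuine `XID_Start` ranges and shows the `alex` character-set macro it produces:

    import io

    buf = io.StringIO()
    # A-Z (U+0041..U+005A) and U+00AA are real XID_Start entries.
    emit_table(buf, "XID_Start", [(0x41, 0x5a), (0xaa, 0xaa)])
    print(buf.getvalue())
    # @XID_Start
    #   = [\x0041-\x005a]
    #   | \x00aa

The generated `UnicodeLexer.x` defines macros of this shape, `@XID_Start` and `@XID_Continue`, covering the full property tables extracted from DerivedCoreProperties.txt.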
