Skip to content

Commit e5e343a

Browse files
committed
Finished Unicode support in the model lexer.
Completed the XID_Start and XID_Continue rules.
1 parent be43713 commit e5e343a

File tree

4 files changed

+229
-18
lines changed

4 files changed

+229
-18
lines changed

src/grammar/RustLexer.g4

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ fragment SUFFIX
9393
;
9494
9595
LIT_CHAR
96-
: '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] ) '\'' SUFFIX?
96+
: '\'' ( '\\' CHAR_ESCAPE | ~[\\'\n\t\r] | '\ud800' .. '\udbff' '\udc00' .. '\udfff' ) '\'' SUFFIX?
9797
;
9898

9999
LIT_BYTE

src/grammar/verify.rs

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010

1111
#![feature(plugin)]
1212

13+
#![allow(unstable)]
14+
1315
extern crate syntax;
1416
extern crate rustc;
1517

@@ -164,7 +166,8 @@ fn count(lit: &str) -> usize {
164166
lit.chars().take_while(|c| *c == '#').count()
165167
}
166168

167-
fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>) -> TokenAndSpan {
169+
fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>, surrogate_pairs_pos: &[usize])
170+
-> TokenAndSpan {
168171
// old regex:
169172
// \[@(?P<seq>\d+),(?P<start>\d+):(?P<end>\d+)='(?P<content>.+?)',<(?P<toknum>-?\d+)>,\d+:\d+]
170173
let start = s.find_str("[@").unwrap();
@@ -213,9 +216,16 @@ fn parse_antlr_token(s: &str, tokens: &HashMap<String, token::Token>) -> TokenAn
213216
0
214217
};
215218

219+
let mut lo = start.parse::<u32>().unwrap() - offset;
220+
let mut hi = end.parse::<u32>().unwrap() + 1;
221+
222+
// Adjust the span: For each surrogate pair already encountered, subtract one position.
223+
lo -= surrogate_pairs_pos.binary_search(&(lo as usize)).unwrap_or_else(|x| x) as u32;
224+
hi -= surrogate_pairs_pos.binary_search(&(hi as usize)).unwrap_or_else(|x| x) as u32;
225+
216226
let sp = syntax::codemap::Span {
217-
lo: syntax::codemap::BytePos(start.parse::<u32>().unwrap() - offset),
218-
hi: syntax::codemap::BytePos(end.parse::<u32>().unwrap() + 1),
227+
lo: syntax::codemap::BytePos(lo),
228+
hi: syntax::codemap::BytePos(hi),
219229
expn_id: syntax::codemap::NO_EXPANSION
220230
};
221231

@@ -235,11 +245,10 @@ fn tok_cmp(a: &token::Token, b: &token::Token) -> bool {
235245
}
236246
}
237247

238-
fn span_cmp(rust_sp: syntax::codemap::Span, antlr_sp: syntax::codemap::Span, cm: &syntax::codemap::CodeMap) -> bool {
239-
println!("{} {}", cm.bytepos_to_file_charpos(rust_sp.lo).to_uint(), cm.bytepos_to_file_charpos(rust_sp.hi).to_uint());
240-
antlr_sp.lo.to_uint() == cm.bytepos_to_file_charpos(rust_sp.lo).to_uint() &&
241-
antlr_sp.hi.to_uint() == cm.bytepos_to_file_charpos(rust_sp.hi).to_uint() &&
242-
antlr_sp.expn_id == rust_sp.expn_id
248+
fn span_cmp(antlr_sp: syntax::codemap::Span, rust_sp: syntax::codemap::Span, cm: &syntax::codemap::CodeMap) -> bool {
249+
antlr_sp.expn_id == rust_sp.expn_id &&
250+
antlr_sp.lo.to_uint() == cm.bytepos_to_file_charpos(rust_sp.lo).to_uint() &&
251+
antlr_sp.hi.to_uint() == cm.bytepos_to_file_charpos(rust_sp.hi).to_uint()
243252
}
244253

245254
fn main() {
@@ -250,16 +259,18 @@ fn main() {
250259

251260
let args = std::os::args();
252261

253-
let mut token_file = File::open(&Path::new(args[2]));
254-
let token_map = parse_token_list(token_file.read_to_string().unwrap());
262+
// Rust's lexer
263+
let code = File::open(&Path::new(args[1])).unwrap().read_to_string().unwrap();
255264

256-
let mut stdin = std::io::stdin();
257-
let mut lock = stdin.lock();
258-
let lines = lock.lines();
259-
let mut antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
260-
&token_map));
265+
let surrogate_pairs_pos: Vec<usize> = code.chars().enumerate()
266+
.filter(|&(_, c)| c as usize > 0xFFFF)
267+
.map(|(n, _)| n)
268+
.enumerate()
269+
.map(|(x, n)| x + n)
270+
.collect();
271+
272+
debug!("Pairs: {:?}", surrogate_pairs_pos);
261273

262-
let code = File::open(&Path::new(args[1])).unwrap().read_to_string().unwrap();
263274
let options = config::basic_options();
264275
let session = session::build_session(options, None,
265276
syntax::diagnostics::registry::Registry::new(&[]));
@@ -269,13 +280,25 @@ fn main() {
269280
let mut lexer = lexer::StringReader::new(session.diagnostic(), filemap);
270281
let ref cm = lexer.span_diagnostic.cm;
271282

283+
// ANTLR
284+
let mut token_file = File::open(&Path::new(args[2]));
285+
let token_map = parse_token_list(token_file.read_to_string().unwrap());
286+
287+
let mut stdin = std::io::stdin();
288+
let mut lock = stdin.lock();
289+
let lines = lock.lines();
290+
let mut antlr_tokens = lines.map(|l| parse_antlr_token(l.unwrap().trim(),
291+
&token_map,
292+
&surrogate_pairs_pos[]));
293+
272294
for antlr_tok in antlr_tokens {
273295
let rustc_tok = next(&mut lexer);
274296
if rustc_tok.tok == token::Eof && antlr_tok.tok == token::Eof {
275297
continue
276298
}
277299

278-
assert!(span_cmp(rustc_tok.sp, antlr_tok.sp, cm), "{:?} and {:?} have different spans", rustc_tok,
300+
assert!(span_cmp(antlr_tok.sp, rustc_tok.sp, cm), "{:?} and {:?} have different spans",
301+
rustc_tok,
279302
antlr_tok);
280303

281304
macro_rules! matches {

src/grammar/xidcontinue.g4

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,4 +372,102 @@ fragment XID_Continue:
372372
| '\uffca' .. '\uffcf'
373373
| '\uffd2' .. '\uffd7'
374374
| '\uffda' .. '\uffdc'
375+
| '\ud800' '\udc00' .. '\udc0a'
376+
| '\ud800' '\udc0d' .. '\udc25'
377+
| '\ud800' '\udc28' .. '\udc39'
378+
| '\ud800' '\udc3c' .. '\udc3c'
379+
| '\ud800' '\udc3f' .. '\udc4c'
380+
| '\ud800' '\udc50' .. '\udc5c'
381+
| '\ud800' '\udc80' .. '\udcf9'
382+
| '\ud800' '\udf00' .. '\udf1d'
383+
| '\ud800' '\udf30' .. '\udf49'
384+
| '\ud800' '\udf80' .. '\udf9c'
385+
| '\ud801' '\ue000' .. '\ue09c'
386+
| '\ud801' '\ue0a0' .. '\ue0a8'
387+
| '\ud802' '\ue400' .. '\ue404'
388+
| '\ud802' '\u0808'
389+
| '\ud802' '\ue40a' .. '\ue434'
390+
| '\ud802' '\ue437' .. '\ue437'
391+
| '\ud802' '\u083c'
392+
| '\ud802' '\u083f'
393+
| '\ud834' '\uad65' .. '\uad68'
394+
| '\ud834' '\uad6d' .. '\uad71'
395+
| '\ud834' '\uad7b' .. '\uad81'
396+
| '\ud834' '\uad85' .. '\uad8a'
397+
| '\ud834' '\uadaa' .. '\uadac'
398+
| '\ud835' '\ub000' .. '\ub053'
399+
| '\ud835' '\ub056' .. '\ub09b'
400+
| '\ud835' '\ub09e' .. '\ub09e'
401+
| '\ud835' '\ud4a2'
402+
| '\ud835' '\ub0a5' .. '\ub0a5'
403+
| '\ud835' '\ub0a9' .. '\ub0ab'
404+
| '\ud835' '\ub0ae' .. '\ub0b8'
405+
| '\ud835' '\ud4bb'
406+
| '\ud835' '\ub0bd' .. '\ub0c2'
407+
| '\ud835' '\ub0c5' .. '\ub104'
408+
| '\ud835' '\ub107' .. '\ub109'
409+
| '\ud835' '\ub10d' .. '\ub113'
410+
| '\ud835' '\ub116' .. '\ub11b'
411+
| '\ud835' '\ub11e' .. '\ub138'
412+
| '\ud835' '\ub13b' .. '\ub13d'
413+
| '\ud835' '\ub140' .. '\ub143'
414+
| '\ud835' '\ud546'
415+
| '\ud835' '\ub14a' .. '\ub14f'
416+
| '\ud835' '\ub152' .. '\ub2a2'
417+
| '\ud835' '\ub2a8' .. '\ub2bf'
418+
| '\ud835' '\ub2c2' .. '\ub2d9'
419+
| '\ud835' '\ub2dc' .. '\ub2f9'
420+
| '\ud835' '\ub2fc' .. '\ub313'
421+
| '\ud835' '\ub316' .. '\ub333'
422+
| '\ud835' '\ub336' .. '\ub34d'
423+
| '\ud835' '\ub350' .. '\ub36d'
424+
| '\ud835' '\ub370' .. '\ub387'
425+
| '\ud835' '\ub38a' .. '\ub3a7'
426+
| '\ud835' '\ub3aa' .. '\ub3c1'
427+
| '\ud835' '\ub3c4' .. '\ub3c8'
428+
| '\ud835' '\ub3ce' .. '\ub3fe'
429+
| '\ud840' '\udc00' .. '\udffe'
430+
| '\ud841' '\ue000' .. '\ue3fe'
431+
| '\ud842' '\ue400' .. '\ue7fe'
432+
| '\ud843' '\ue800' .. '\uebfe'
433+
| '\ud844' '\uec00' .. '\ueffe'
434+
| '\ud845' '\uf000' .. '\uf3fe'
435+
| '\ud846' '\uf400' .. '\uf7fe'
436+
| '\ud847' '\uf800' .. '\ufbfe'
437+
| '\ud848' '\ufc00' .. '\ufffe'
438+
| '\ud849' '\u0000' .. '\u03fe'
439+
| '\ud84a' '\u0400' .. '\u07fe'
440+
| '\ud84b' '\u0800' .. '\u0bfe'
441+
| '\ud84c' '\u0c00' .. '\u0ffe'
442+
| '\ud84d' '\u1000' .. '\u13fe'
443+
| '\ud84e' '\u1400' .. '\u17fe'
444+
| '\ud84f' '\u1800' .. '\u1bfe'
445+
| '\ud850' '\u1c00' .. '\u1ffe'
446+
| '\ud851' '\u2000' .. '\u23fe'
447+
| '\ud852' '\u2400' .. '\u27fe'
448+
| '\ud853' '\u2800' .. '\u2bfe'
449+
| '\ud854' '\u2c00' .. '\u2ffe'
450+
| '\ud855' '\u3000' .. '\u33fe'
451+
| '\ud856' '\u3400' .. '\u37fe'
452+
| '\ud857' '\u3800' .. '\u3bfe'
453+
| '\ud858' '\u3c00' .. '\u3ffe'
454+
| '\ud859' '\u4000' .. '\u43fe'
455+
| '\ud85a' '\u4400' .. '\u47fe'
456+
| '\ud85b' '\u4800' .. '\u4bfe'
457+
| '\ud85c' '\u4c00' .. '\u4ffe'
458+
| '\ud85d' '\u5000' .. '\u53fe'
459+
| '\ud85e' '\u5400' .. '\u57fe'
460+
| '\ud85f' '\u5800' .. '\u5bfe'
461+
| '\ud860' '\u5c00' .. '\u5ffe'
462+
| '\ud861' '\u6000' .. '\u63fe'
463+
| '\ud862' '\u6400' .. '\u67fe'
464+
| '\ud863' '\u6800' .. '\u6bfe'
465+
| '\ud864' '\u6c00' .. '\u6ffe'
466+
| '\ud865' '\u7000' .. '\u73fe'
467+
| '\ud866' '\u7400' .. '\u77fe'
468+
| '\ud867' '\u7800' .. '\u7bfe'
469+
| '\ud868' '\u7c00' .. '\u7ffe'
470+
| '\ud869' '\u8000' .. '\u82d5'
471+
| '\ud87e' '\ud400' .. '\ud61c'
472+
| '\udb40' '\udd00' .. '\uddee'
375473
;

src/grammar/xidstart.g4

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,4 +286,94 @@ fragment XID_Start :
286286
| '\uffca' .. '\uffcf'
287287
| '\uffd2' .. '\uffd7'
288288
| '\uffda' .. '\uffdc'
289+
| '\ud800' '\udc00' .. '\udc0a'
290+
| '\ud800' '\udc0d' .. '\udc25'
291+
| '\ud800' '\udc28' .. '\udc39'
292+
| '\ud800' '\udc3c' .. '\udc3c'
293+
| '\ud800' '\udc3f' .. '\udc4c'
294+
| '\ud800' '\udc50' .. '\udc5c'
295+
| '\ud800' '\udc80' .. '\udcf9'
296+
| '\ud800' '\udf00' .. '\udf1d'
297+
| '\ud800' '\udf30' .. '\udf49'
298+
| '\ud800' '\udf80' .. '\udf9c'
299+
| '\ud801' '\ue000' .. '\ue09c'
300+
| '\ud802' '\ue400' .. '\ue404'
301+
| '\ud802' '\u0808'
302+
| '\ud802' '\ue40a' .. '\ue434'
303+
| '\ud802' '\ue437' .. '\ue437'
304+
| '\ud802' '\u083c'
305+
| '\ud802' '\u083f'
306+
| '\ud835' '\ub000' .. '\ub053'
307+
| '\ud835' '\ub056' .. '\ub09b'
308+
| '\ud835' '\ub09e' .. '\ub09e'
309+
| '\ud835' '\ud4a2'
310+
| '\ud835' '\ub0a5' .. '\ub0a5'
311+
| '\ud835' '\ub0a9' .. '\ub0ab'
312+
| '\ud835' '\ub0ae' .. '\ub0b8'
313+
| '\ud835' '\ud4bb'
314+
| '\ud835' '\ub0bd' .. '\ub0c2'
315+
| '\ud835' '\ub0c5' .. '\ub104'
316+
| '\ud835' '\ub107' .. '\ub109'
317+
| '\ud835' '\ub10d' .. '\ub113'
318+
| '\ud835' '\ub116' .. '\ub11b'
319+
| '\ud835' '\ub11e' .. '\ub138'
320+
| '\ud835' '\ub13b' .. '\ub13d'
321+
| '\ud835' '\ub140' .. '\ub143'
322+
| '\ud835' '\ud546'
323+
| '\ud835' '\ub14a' .. '\ub14f'
324+
| '\ud835' '\ub152' .. '\ub2a2'
325+
| '\ud835' '\ub2a8' .. '\ub2bf'
326+
| '\ud835' '\ub2c2' .. '\ub2d9'
327+
| '\ud835' '\ub2dc' .. '\ub2f9'
328+
| '\ud835' '\ub2fc' .. '\ub313'
329+
| '\ud835' '\ub316' .. '\ub333'
330+
| '\ud835' '\ub336' .. '\ub34d'
331+
| '\ud835' '\ub350' .. '\ub36d'
332+
| '\ud835' '\ub370' .. '\ub387'
333+
| '\ud835' '\ub38a' .. '\ub3a7'
334+
| '\ud835' '\ub3aa' .. '\ub3c1'
335+
| '\ud835' '\ub3c4' .. '\ub3c8'
336+
| '\ud840' '\udc00' .. '\udffe'
337+
| '\ud841' '\ue000' .. '\ue3fe'
338+
| '\ud842' '\ue400' .. '\ue7fe'
339+
| '\ud843' '\ue800' .. '\uebfe'
340+
| '\ud844' '\uec00' .. '\ueffe'
341+
| '\ud845' '\uf000' .. '\uf3fe'
342+
| '\ud846' '\uf400' .. '\uf7fe'
343+
| '\ud847' '\uf800' .. '\ufbfe'
344+
| '\ud848' '\ufc00' .. '\ufffe'
345+
| '\ud849' '\u0000' .. '\u03fe'
346+
| '\ud84a' '\u0400' .. '\u07fe'
347+
| '\ud84b' '\u0800' .. '\u0bfe'
348+
| '\ud84c' '\u0c00' .. '\u0ffe'
349+
| '\ud84d' '\u1000' .. '\u13fe'
350+
| '\ud84e' '\u1400' .. '\u17fe'
351+
| '\ud84f' '\u1800' .. '\u1bfe'
352+
| '\ud850' '\u1c00' .. '\u1ffe'
353+
| '\ud851' '\u2000' .. '\u23fe'
354+
| '\ud852' '\u2400' .. '\u27fe'
355+
| '\ud853' '\u2800' .. '\u2bfe'
356+
| '\ud854' '\u2c00' .. '\u2ffe'
357+
| '\ud855' '\u3000' .. '\u33fe'
358+
| '\ud856' '\u3400' .. '\u37fe'
359+
| '\ud857' '\u3800' .. '\u3bfe'
360+
| '\ud858' '\u3c00' .. '\u3ffe'
361+
| '\ud859' '\u4000' .. '\u43fe'
362+
| '\ud85a' '\u4400' .. '\u47fe'
363+
| '\ud85b' '\u4800' .. '\u4bfe'
364+
| '\ud85c' '\u4c00' .. '\u4ffe'
365+
| '\ud85d' '\u5000' .. '\u53fe'
366+
| '\ud85e' '\u5400' .. '\u57fe'
367+
| '\ud85f' '\u5800' .. '\u5bfe'
368+
| '\ud860' '\u5c00' .. '\u5ffe'
369+
| '\ud861' '\u6000' .. '\u63fe'
370+
| '\ud862' '\u6400' .. '\u67fe'
371+
| '\ud863' '\u6800' .. '\u6bfe'
372+
| '\ud864' '\u6c00' .. '\u6ffe'
373+
| '\ud865' '\u7000' .. '\u73fe'
374+
| '\ud866' '\u7400' .. '\u77fe'
375+
| '\ud867' '\u7800' .. '\u7bfe'
376+
| '\ud868' '\u7c00' .. '\u7ffe'
377+
| '\ud869' '\u8000' .. '\u82d5'
378+
| '\ud87e' '\ud400' .. '\ud61c'
289379
;

0 commit comments

Comments
 (0)