Skip to content

Commit 8a8e497

Browse files
committed
Handle CRLF properly in the lexer
The lexer already ignores CRLF in between tokens, but it doesn't properly handle carriage returns inside strings and doc comments. Teach it to treat CRLF as LF inside these tokens, and to disallow carriage returns that are not followed by linefeeds. This includes handling an escaped CRLF inside a regular string token the same way it handles an escaped LF. This is technically a breaking change, as bare carriage returns are no longer allowed, and CRLF sequences are now treated as LF inside strings and doc comments, but it's very unlikely to actually affect any real-world code. This change is necessary to have Rust code compile on Windows the same way it does on Unix. The mozilla/rust repository explicitly sets eol=lf for Rust source files, but other Rust repositories don't. Notably, rust-http cannot be compiled on Windows without converting the CRLF line endings back to LF. [breaking-change]
1 parent d41058e commit 8a8e497

File tree

5 files changed

+215
-23
lines changed

5 files changed

+215
-23
lines changed

src/libsyntax/parse/lexer/mod.rs

+118-23
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,47 @@ impl<'a> StringReader<'a> {
225225
self.byte_offset(end).to_uint()))
226226
}
227227

228+
/// Converts CRLF to LF in the given string, raising an error on bare CR.
///
/// `s` is the token text and `start` is the position that corresponds to
/// byte offset 0 of `s`; it is only used to compute the span handed to
/// `err_span_` when a bare CR (a `\r` not immediately followed by `\n`)
/// is found. `errmsg` is the message reported for each such bare CR.
///
/// The scan is two-phase: as long as no CRLF pair has been seen, nothing
/// is allocated and `s` itself is returned via `into_maybe_owned()`. At
/// the first CRLF pair the work is handed to the nested `translate_crlf_`
/// helper, which builds a new `String` without the carriage returns.
///
/// NOTE(review): a bare CR that occurs *before* the first CRLF pair is
/// reported but stays in the returned text (the helper starts copying
/// from offset 0), whereas a bare CR at or after the first CRLF pair is
/// reported and omitted. An error is reported in both cases, so this
/// only affects the recovered text -- confirm whether that is intended.
229+
fn translate_crlf<'a>(&self, start: BytePos,
230+
s: &'a str, errmsg: &'a str) -> str::MaybeOwned<'a> {
231+
let mut i = 0u;
232+
while i < s.len() {
233+
let str::CharRange { ch, next } = s.char_range_at(i);
234+
if ch == '\r' {
235+
// CR followed by LF: switch to the allocating path, resuming at this CR.
if next < s.len() && s.char_at(next) == '\n' {
236+
return translate_crlf_(self, start, s, errmsg, i).into_maybe_owned();
237+
}
238+
// Bare CR: report it with a span covering just the CR, then keep scanning.
let pos = start + BytePos(i as u32);
239+
let end_pos = start + BytePos(next as u32);
240+
self.err_span_(pos, end_pos, errmsg);
241+
}
242+
i = next;
243+
}
244+
// No CRLF pair anywhere in `s`: hand back the input string unchanged.
return s.into_maybe_owned();
245+
246+
// Allocating path. `i` is the byte offset to resume scanning from (the
// first CRLF pair found by the caller); `j` is the start of the pending
// run of `s` that has not yet been copied into `buf`.
fn translate_crlf_(rdr: &StringReader, start: BytePos,
247+
s: &str, errmsg: &str, mut i: uint) -> String {
248+
let mut buf = String::with_capacity(s.len());
249+
let mut j = 0;
250+
while i < s.len() {
251+
let str::CharRange { ch, next } = s.char_range_at(i);
252+
if ch == '\r' {
253+
// Flush everything up to (not including) this CR, then skip the CR
// itself by moving the pending-run start past it.
if j < i { buf.push_str(s.slice(j, i)); }
254+
j = next;
255+
// The skipped CR was not part of a CRLF pair: report it as bare.
if next >= s.len() || s.char_at(next) != '\n' {
256+
let pos = start + BytePos(i as u32);
257+
let end_pos = start + BytePos(next as u32);
258+
rdr.err_span_(pos, end_pos, errmsg);
259+
}
260+
}
261+
i = next;
262+
}
263+
// Copy the tail that follows the last CR.
if j < s.len() { buf.push_str(s.slice_from(j)); }
264+
buf
265+
}
266+
}
267+
268+
228269
/// Advance the StringReader by one character. If a newline is
229270
/// discovered, add it to the FileMap's list of line start offsets.
230271
pub fn bump(&mut self) {
@@ -305,7 +346,20 @@ impl<'a> StringReader<'a> {
305346
// line comments starting with "///" or "//!" are doc-comments
306347
if self.curr_is('/') || self.curr_is('!') {
307348
let start_bpos = self.pos - BytePos(3);
308-
while !self.curr_is('\n') && !self.is_eof() {
349+
while !self.is_eof() {
350+
match self.curr.unwrap() {
351+
'\n' => break,
352+
'\r' => {
353+
if self.nextch_is('\n') {
354+
// CRLF
355+
break
356+
} else {
357+
self.err_span_(self.last_pos, self.pos,
358+
"bare CR not allowed in doc-comment");
359+
}
360+
}
361+
_ => ()
362+
}
309363
self.bump();
310364
}
311365
let ret = self.with_str_from(start_bpos, |string| {
@@ -370,6 +424,7 @@ impl<'a> StringReader<'a> {
370424
let start_bpos = self.last_pos - BytePos(2);
371425

372426
let mut level: int = 1;
427+
let mut has_cr = false;
373428
while level > 0 {
374429
if self.is_eof() {
375430
let msg = if is_doc_comment {
@@ -379,25 +434,35 @@ impl<'a> StringReader<'a> {
379434
};
380435
let last_bpos = self.last_pos;
381436
self.fatal_span_(start_bpos, last_bpos, msg);
382-
} else if self.curr_is('/') && self.nextch_is('*') {
383-
level += 1;
384-
self.bump();
385-
self.bump();
386-
} else if self.curr_is('*') && self.nextch_is('/') {
387-
level -= 1;
388-
self.bump();
389-
self.bump();
390-
} else {
391-
self.bump();
392437
}
438+
let n = self.curr.unwrap();
439+
match n {
440+
'/' if self.nextch_is('*') => {
441+
level += 1;
442+
self.bump();
443+
}
444+
'*' if self.nextch_is('/') => {
445+
level -= 1;
446+
self.bump();
447+
}
448+
'\r' => {
449+
has_cr = true;
450+
}
451+
_ => ()
452+
}
453+
self.bump();
393454
}
394455

395456
let res = if is_doc_comment {
396457
self.with_str_from(start_bpos, |string| {
397458
// but comments with only "*"s between two "/"s are not
398459
if !is_block_non_doc_comment(string) {
460+
let string = if has_cr {
461+
self.translate_crlf(start_bpos, string,
462+
"bare CR not allowed in block doc-comment")
463+
} else { string.into_maybe_owned() };
399464
Some(TokenAndSpan{
400-
tok: token::DOC_COMMENT(str_to_ident(string)),
465+
tok: token::DOC_COMMENT(str_to_ident(string.as_slice())),
401466
sp: codemap::mk_sp(start_bpos, self.last_pos)
402467
})
403468
} else {
@@ -675,6 +740,10 @@ impl<'a> StringReader<'a> {
675740
self.consume_whitespace();
676741
return None
677742
},
743+
'\r' if delim == '"' && self.curr_is('\n') => {
744+
self.consume_whitespace();
745+
return None
746+
}
678747
c => {
679748
let last_pos = self.last_pos;
680749
self.err_span_char(
@@ -696,6 +765,15 @@ impl<'a> StringReader<'a> {
696765
else { "character constant must be escaped" },
697766
first_source_char);
698767
}
768+
'\r' => {
769+
if self.curr_is('\n') {
770+
self.bump();
771+
return Some('\n');
772+
} else {
773+
self.err_span_(start, self.last_pos,
774+
"bare CR not allowed in string, use \\r instead");
775+
}
776+
}
699777
_ => if ascii_only && first_source_char > '\x7F' {
700778
let last_pos = self.last_pos;
701779
self.err_span_char(
@@ -1042,28 +1120,45 @@ impl<'a> StringReader<'a> {
10421120
self.bump();
10431121
let content_start_bpos = self.last_pos;
10441122
let mut content_end_bpos;
1123+
let mut has_cr = false;
10451124
'outer: loop {
10461125
if self.is_eof() {
10471126
let last_bpos = self.last_pos;
10481127
self.fatal_span_(start_bpos, last_bpos, "unterminated raw string");
10491128
}
1050-
if self.curr_is('"') {
1051-
content_end_bpos = self.last_pos;
1052-
for _ in range(0, hash_count) {
1053-
self.bump();
1054-
if !self.curr_is('#') {
1055-
continue 'outer;
1129+
//if self.curr_is('"') {
1130+
//content_end_bpos = self.last_pos;
1131+
//for _ in range(0, hash_count) {
1132+
//self.bump();
1133+
//if !self.curr_is('#') {
1134+
//continue 'outer;
1135+
let c = self.curr.unwrap();
1136+
match c {
1137+
'"' => {
1138+
content_end_bpos = self.last_pos;
1139+
for _ in range(0, hash_count) {
1140+
self.bump();
1141+
if !self.curr_is('#') {
1142+
continue 'outer;
1143+
}
10561144
}
1145+
break;
1146+
}
1147+
'\r' => {
1148+
has_cr = true;
10571149
}
1058-
break;
1150+
_ => ()
10591151
}
10601152
self.bump();
10611153
}
10621154
self.bump();
1063-
let str_content = self.with_str_from_to(
1064-
content_start_bpos,
1065-
content_end_bpos,
1066-
str_to_ident);
1155+
let str_content = self.with_str_from_to(content_start_bpos, content_end_bpos, |string| {
1156+
let string = if has_cr {
1157+
self.translate_crlf(content_start_bpos, string,
1158+
"bare CR not allowed in raw string")
1159+
} else { string.into_maybe_owned() };
1160+
str_to_ident(string.as_slice())
1161+
});
10671162
return token::LIT_STR_RAW(str_content, hash_count);
10681163
}
10691164
'-' => {

src/libsyntax/parse/mod.rs

+22
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,8 @@ mod test {
288288
use owned_slice::OwnedSlice;
289289
use ast;
290290
use abi;
291+
use attr;
292+
use attr::AttrMetaMethods;
291293
use parse::parser::Parser;
292294
use parse::token::{str_to_ident};
293295
use util::parser_testing::{string_to_tts, string_to_parser};
@@ -726,4 +728,24 @@ mod test {
726728
}".to_string());
727729
}
728730

731+
#[test] fn crlf_doc_comments() {
// Parses items whose doc comments are terminated by, or contain, CRLF and
// checks the text stored in the resulting `doc` attributes.
732+
let sess = new_parse_sess();
733+
734+
let name = "<source>".to_string();
735+
// Case 1: a single `///` line ending in CRLF; the expected attribute
// value has no trailing carriage return.
let source = "/// doc comment\r\nfn foo() {}".to_string();
736+
let item = parse_item_from_source_str(name.clone(), source, Vec::new(), &sess).unwrap();
737+
let doc = attr::first_attr_value_str_by_name(item.attrs.as_slice(), "doc").unwrap();
738+
assert_eq!(doc.get(), "/// doc comment");
739+
740+
// Case 2: two consecutive `///` lines, each ending in CRLF; every `doc`
// attribute on the item is collected and compared in order.
let source = "/// doc comment\r\n/// line 2\r\nfn foo() {}".to_string();
741+
let item = parse_item_from_source_str(name.clone(), source, Vec::new(), &sess).unwrap();
742+
let docs = item.attrs.iter().filter(|a| a.name().get() == "doc")
743+
.map(|a| a.value_str().unwrap().get().to_string()).collect::<Vec<_>>();
744+
assert_eq!(docs.as_slice(), &["/// doc comment".to_string(), "/// line 2".to_string()]);
745+
746+
// Case 3: a block doc comment with a CRLF inside it; the expected value
// contains a plain LF where the source had CRLF.
let source = "/** doc comment\r\n * with CRLF */\r\nfn foo() {}".to_string();
747+
let item = parse_item_from_source_str(name, source, Vec::new(), &sess).unwrap();
748+
let doc = attr::first_attr_value_str_by_name(item.attrs.as_slice(), "doc").unwrap();
749+
assert_eq!(doc.get(), "/** doc comment\n * with CRLF */");
750+
}
729751
}

src/test/compile-fail/lex-bare-cr-string-literal-doc-comment.rs

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
// ignore-tidy-cr
12+
13+
/// doc comment with bare CR: ''
14+
pub fn foo() {}
15+
//~^^ ERROR: bare CR not allowed in doc-comment
16+
17+
/** block doc comment with bare CR: '' */
18+
pub fn bar() {}
19+
//~^^ ERROR: bare CR not allowed in block doc-comment
20+
21+
fn main() {
22+
// the following string literal has a bare CR in it
23+
let _s = "foobar"; //~ ERROR: bare CR not allowed in string
24+
25+
// the following string literal has a bare CR in it
26+
let _s = r"barfoo"; //~ ERROR: bare CR not allowed in raw string
27+
28+
// the following string literal has a bare CR in it
29+
let _s = "foo\bar"; //~ ERROR: unknown character escape: \r
30+
}

src/test/run-pass/.gitattributes

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
lexer-crlf-line-endings-string-literal-doc-comment.rs -text
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// ignore-tidy-cr ignore-license
2+
// ignore-tidy-cr (repeated again because of tidy bug)
3+
// license is ignored because tidy can't handle the CRLF here properly.
4+
5+
// Copyright 2014 The Rust Project Developers. See the COPYRIGHT
6+
// file at the top-level directory of this distribution and at
7+
// http://rust-lang.org/COPYRIGHT.
8+
//
9+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
10+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
11+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
12+
// option. This file may not be copied, modified, or distributed
13+
// except according to those terms.
14+
15+
// NB: this file needs CRLF line endings. The .gitattributes file in
16+
// this directory should enforce it.
17+
18+
// ignore-pretty
19+
20+
/// Doc comment that ends in CRLF
21+
pub fn foo() {}
22+
23+
/** Block doc comment that
24+
* contains CRLF characters
25+
*/
26+
pub fn bar() {}
27+
28+
fn main() {
29+
let s = "string
30+
literal";
31+
assert_eq!(s, "string\nliteral");
32+
33+
let s = "literal with \
34+
escaped newline";
35+
assert_eq!(s, "literal with escaped newline");
36+
37+
let s = r"string
38+
literal";
39+
assert_eq!(s, "string\nliteral");
40+
41+
// validate that our source file has CRLF endings
42+
let source = include_str!("lexer-crlf-line-endings-string-literal-doc-comment.rs");
43+
assert!(source.contains("string\r\nliteral"));
44+
}

0 commit comments

Comments
 (0)