From 096d39d4051af8e918c9d8d77554487525ac48d7 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:51 +0100 Subject: [PATCH 1/7] Better filter trivia in dumps Make sure we don't try and print things like empty comma lists `,,,,` or redundant parens for concatenations that had their trivia filtered out. --- .../_RegexParser/Regex/Printing/DumpAST.swift | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 47142407a..0e40ad2ce 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -44,18 +44,23 @@ extension _ASTPrintable { guard let children = _children else { return _dumpBase } - let sub = children.lazy.compactMap { + let childDump = children.compactMap { child -> String? in // Exclude trivia for now, as we don't want it to appear when performing // comparisons of dumped output in tests. // TODO: We should eventually have some way of filtering out trivia for // tests, so that it can appear in regular dumps. - if $0.isTrivia { return nil } - return $0._dump() - }.joined(separator: ",") - if sub.isEmpty { - return "\(_dumpBase)" + if child.isTrivia { return nil } + let dump = child._dump() + return !dump.isEmpty ? 
dump : nil } - return "\(_dumpBase)(\(sub))" + let base = "\(_dumpBase)" + if childDump.isEmpty { + return base + } + if childDump.count == 1, base.isEmpty { + return "\(childDump[0])" + } + return "\(base)(\(childDump.joined(separator: ",")))" } } @@ -77,7 +82,7 @@ extension AST.Node: _ASTPrintable { } extension AST.Alternation { - public var _dumpBase: String { "alternation" } + public var _dumpBase: String { "alternation<\(children.count)>" } } extension AST.Concatenation { From c6dc547908bd3aab852e04c47286a651b31d8c00 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:51 +0100 Subject: [PATCH 2/7] Formalize non-semantic whitespace matching It turns out that the set of non-semantic whitespace characters is already defined by Unicode as the `Pattern_White_Space` property, so query that property instead of maintaining a hand-written character list. --- .../Regex/Parse/LexicalAnalysis.swift | 26 ++++--------------- .../_RegexParser/Utility/MissingUnicode.swift | 6 +++++ 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 4eb0ebea4..b595f3d29 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -550,28 +550,12 @@ extension Source { ) throws -> AST.Trivia? { guard context.ignoreWhitespace else { return nil } - func isWhitespace(_ c: Character) -> Bool { - // This is a list of characters that PCRE treats as whitespace when - // compiled with Unicode support. It is a subset of the characters with - // the `.isWhitespace` property. ICU appears to also follow this list. - // Oniguruma and .NET follow a subset of this list. - // - // FIXME: PCRE only treats space and tab characters as whitespace when - // inside a custom character class (and only treats whitespace as - // non-semantic there for the extra-extended `(?xx)` mode). If we get a - // strict-PCRE mode, we'll need to add a case for that. 
- switch c { - case " ", "\u{9}"..."\u{D}", // space, \t, \n, vertical tab, \f, \r - "\u{85}", "\u{200E}", // next line, left-to-right mark - "\u{200F}", "\u{2028}", // right-to-left-mark, line separator - "\u{2029}": // paragraph separator - return true - default: - return false - } - } + // FIXME: PCRE only treats space and tab characters as whitespace when + // inside a custom character class (and only treats whitespace as + // non-semantic there for the extra-extended `(?xx)` mode). If we get a + // strict-PCRE mode, we'll need to add a case for that. let trivia: Located? = recordLoc { src in - src.tryEatPrefix(isWhitespace)?.string + src.tryEatPrefix(\.isPatternWhitespace)?.string } guard let trivia = trivia else { return nil } return AST.Trivia(trivia) diff --git a/Sources/_RegexParser/Utility/MissingUnicode.swift b/Sources/_RegexParser/Utility/MissingUnicode.swift index dccba3286..4d819806b 100644 --- a/Sources/_RegexParser/Utility/MissingUnicode.swift +++ b/Sources/_RegexParser/Utility/MissingUnicode.swift @@ -660,6 +660,12 @@ extension Character { public var isOctalDigit: Bool { ("0"..."7").contains(self) } public var isWordCharacter: Bool { isLetter || isNumber || self == "_" } + + /// Whether this character represents whitespace for the purposes of pattern + /// parsing. 
+ public var isPatternWhitespace: Bool { + return unicodeScalars.first!.properties.isPatternWhitespace + } } extension UnicodeScalar { From a96648badd28106b4db723aca44b1f83fa956ffe Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:52 +0100 Subject: [PATCH 3/7] Rename endOfString -> unterminated --- .../Regex/Parse/DelimiterLexing.swift | 8 ++++---- Sources/_RegexParser/Regex/Parse/Mocking.swift | 2 +- Tests/RegexTests/ParseTests.swift | 16 ++++++++-------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift index 1227ade1f..e88c1fa80 100644 --- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift @@ -41,7 +41,7 @@ enum Delimiter: Hashable, CaseIterable { struct DelimiterLexError: Error, CustomStringConvertible { enum Kind: Hashable { - case endOfString + case unterminated case invalidUTF8 // TODO: better range reporting case unknownDelimiter case unprintableASCII @@ -59,7 +59,7 @@ struct DelimiterLexError: Error, CustomStringConvertible { var description: String { switch kind { - case .endOfString: return "unterminated regex literal" + case .unterminated: return "unterminated regex literal" case .invalidUTF8: return "invalid UTF-8 found in source file" case .unknownDelimiter: return "unknown regex literal delimiter" case .unprintableASCII: return "unprintable ASCII character found in source file" @@ -238,7 +238,7 @@ fileprivate struct DelimiterLexer { /// the end of the buffer is reached. 
mutating func advance(escaped: Bool = false) throws { guard let next = load() else { - throw DelimiterLexError(.endOfString, resumeAt: cursor) + throw DelimiterLexError(.unterminated, resumeAt: cursor) } switch UnicodeScalar(next) { case let next where !next.isASCII: @@ -249,7 +249,7 @@ fileprivate struct DelimiterLexer { advanceCursor() case "\n", "\r": - throw DelimiterLexError(.endOfString, resumeAt: cursor) + throw DelimiterLexError(.unterminated, resumeAt: cursor) case "\0": // TODO: Warn to match the behavior of String literal lexer? Or should diff --git a/Sources/_RegexParser/Regex/Parse/Mocking.swift b/Sources/_RegexParser/Regex/Parse/Mocking.swift index 5994a4f52..596a59bf4 100644 --- a/Sources/_RegexParser/Regex/Parse/Mocking.swift +++ b/Sources/_RegexParser/Regex/Parse/Mocking.swift @@ -62,7 +62,7 @@ func libswiftLexRegexLiteral( curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self) switch error.kind { - case .endOfString: + case .unterminated: // Missing closing delimiter can be recovered from. return false case .unprintableASCII, .invalidUTF8: diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index f6f31c075..649ea22e2 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2079,21 +2079,21 @@ extension RegexTests { // MARK: Printable ASCII - delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString) + delimiterLexingDiagnosticTest(#"re'\\#n'"#, .unterminated) for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r. 
+ delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII) } - delimiterLexingDiagnosticTest("re'\n'", .endOfString) - delimiterLexingDiagnosticTest("re'\r'", .endOfString) + delimiterLexingDiagnosticTest("re'\n'", .unterminated) + delimiterLexingDiagnosticTest("re'\r'", .unterminated) delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII) // MARK: Delimiter skipping - delimiterLexingDiagnosticTest("re'(?''", .endOfString) - delimiterLexingDiagnosticTest("re'(?'abc'", .endOfString) - delimiterLexingDiagnosticTest("re'(?('abc'", .endOfString) - delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .endOfString) - delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .endOfString) + delimiterLexingDiagnosticTest("re'(?''", .unterminated) + delimiterLexingDiagnosticTest("re'(?'abc'", .unterminated) + delimiterLexingDiagnosticTest("re'(?('abc'", .unterminated) + delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .unterminated) + delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .unterminated) } func testlibswiftDiagnostics() { From 120ffc90de110ed3e2d1af382cb2f0f093e340da Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:52 +0100 Subject: [PATCH 4/7] Fix end-of-line-comment lexing Previously we would just lex to the end of the input, as it was assumed that only single-line regex literals would be supported. Update the implementation to end the comment at the first newline so that multi-line input is handled, and take the PCRE global newline-matching options (e.g. `(*CR)`, `(*ANY)`) into account when deciding what counts as a newline. 
--- .../Regex/AST/MatchingOptions.swift | 3 +- .../Regex/Parse/LexicalAnalysis.swift | 26 ++- Sources/_RegexParser/Regex/Parse/Parse.swift | 13 ++ Sources/_RegexParser/Regex/Parse/Source.swift | 6 + Tests/RegexTests/ParseTests.swift | 208 ++++++++++++++++++ 5 files changed, 250 insertions(+), 6 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift index 25cb10842..808b51287 100644 --- a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift +++ b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift @@ -137,7 +137,8 @@ extension AST { /// Global matching option specifiers. Unlike `MatchingOptionSequence`, /// these must appear at the start of the pattern, and apply globally. public struct GlobalMatchingOption: _ASTNode, Hashable { - /// Determines the definition of a newline for the '.' character class. + /// Determines the definition of a newline for the '.' character class and + /// when parsing end-of-line comments. public enum NewlineMatching: Hashable { /// (*CR*) case carriageReturnOnly diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index b595f3d29..165e97d1a 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -528,11 +528,27 @@ extension Source { return try src.expectQuoted(endingWith: "*/").value } if context.endOfLineComments, src.tryEat("#") { - // TODO: If we ever support multi-line regex literals, this will need - // to be updated to stop at a newline. Note though that PCRE specifies - // that the newline it matches against can be controlled by the global - // matching options e.g `(*CR)`, `(*ANY)`, ... - return src.lexUntil(\.isEmpty).value + // Try eat until we either exhaust the input, or hit a newline. Note + // that the definition of newline can be altered depending on the global + // matching options. 
By default we consider a newline to be `\n` or + // `\r`. + return src.lexUntil { src in + if src.isEmpty { return true } + switch context.newlineMode { + case .carriageReturnOnly: + return src.tryEat("\r") + case .linefeedOnly: + return src.tryEat("\n") + case .carriageAndLinefeedOnly: + return src.tryEat("\r\n") + case .anyCarriageReturnOrLinefeed: + return src.tryEat(anyOf: "\r", "\n", "\r\n") != nil + case .anyUnicode: + return src.tryEat(where: \.isNewline) + case .nulCharacter: + return src.tryEat("\0") + } + }.value } return nil } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 7867073e6..2512f9bf2 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -76,6 +76,10 @@ struct ParsingContext { /// The syntax options currently set. fileprivate(set) var syntax: SyntaxOptions + /// The current newline matching mode. + fileprivate(set) var newlineMode: AST.GlobalMatchingOption.NewlineMatching + = .anyCarriageReturnOrLinefeed + fileprivate mutating func recordGroup(_ g: AST.Group.Kind) { // TODO: Needs to track group number resets (?|...). priorGroupCount += 1 @@ -139,6 +143,15 @@ extension Parser { // First parse any global matching options if present. let opts = try source.lexGlobalMatchingOptionSequence() + // If we have a newline mode global option, update the context accordingly. + if let opts = opts { + for opt in opts.options.reversed() { + guard case .newlineMatching(let newline) = opt.kind else { continue } + context.newlineMode = newline + break + } + } + // Then parse the root AST node. 
let ast = try parseNode() guard source.isEmpty else { diff --git a/Sources/_RegexParser/Regex/Parse/Source.swift b/Sources/_RegexParser/Regex/Parse/Source.swift index ddf0475f3..6eac16395 100644 --- a/Sources/_RegexParser/Regex/Parse/Source.swift +++ b/Sources/_RegexParser/Regex/Parse/Source.swift @@ -68,6 +68,12 @@ extension Source { return true } + mutating func tryEat(where pred: (Char) throws -> Bool) rethrows -> Bool { + guard let next = peek(), try pred(next) else { return false } + advance() + return true + } + mutating func tryEat(sequence c: C) -> Bool where C.Element == Char { guard _slice.starts(with: c) else { return false } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 649ea22e2..b185234a0 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1526,6 +1526,214 @@ extension RegexTests { matchingOptions(adding: .extended), isIsolated: true, charClass("a", "b")) ) + // Test multi-line comment handling. 
+ parseTest( + """ + # a + bc # d + ef# g + # h + """, + concat("b", "c", "e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + # a\r\ + bc # d\r\ + ef# g\r\ + # h\r + """, + concat("b", "c", "e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + # a\r\ + bc # d\r\ + ef# g\r\ + # h\r + """, + concat("b", "c", "e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + # a\r + bc # d\r + ef# g\r + # h\r + """, + concat("b", "c", "e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + # a\n\r\ + bc # d\n\r\ + ef# g\n\r\ + # h\n\r + """, + concat("b", "c", "e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + (*CR) + # a + bc # d + ef# g + # h + """, + ast(empty(), opts: .newlineMatching(.carriageReturnOnly)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*CR)\r\ + # a\r\ + bc # d\r\ + ef# g\r\ + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageReturnOnly)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*LF) + # a + bc # d + ef# g + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.linefeedOnly)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*CRLF) + # a + bc # d + ef# g + # h + """, + ast(empty(), opts: .newlineMatching(.carriageAndLinefeedOnly)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*CRLF) + # a\r + bc # d\r + ef# g\r + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageAndLinefeedOnly)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*ANYCRLF) + # a + bc # d + ef# g + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*ANYCRLF) + # a\r\ + bc # d\r\ + ef# g\r\ + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*ANYCRLF) + # a\r + bc # d\r + ef# g\r + # h + """, + ast(concat("b", "c", "e", "f"), opts: 
.newlineMatching(.anyCarriageReturnOrLinefeed)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*ANY) + # a + bc # d + ef# g + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), + syntax: .extendedSyntax + ) + parseTest( + """ + # a\u{2028}\ + bc # d + ef# g\u{2028}\ + # h + """, + concat("e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + (*ANY) + # a\u{2028}\ + bc # d\u{2028}\ + ef# g\u{2028}\ + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*NUL) + # a + bc # d\0\ + ef# g + # h + """, + ast(concat("e", "f"), opts: .newlineMatching(.nulCharacter)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*NUL) + # a\0\ + bc # d\0\ + ef# g\0\ + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.nulCharacter)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*CR)(*NUL) + # a\0\ + bc # d\0\ + ef# g\0\ + # h + """, + ast(concat("b", "c", "e", "f"), + opts: .newlineMatching(.carriageReturnOnly), + .newlineMatching(.nulCharacter) + ), + syntax: .extendedSyntax + ) + // MARK: Parse with delimiters parseWithDelimitersTest("#/a b/#", concat("a", " ", "b")) From 4944fbea80d5abbbcc2bc03cc511868aebae949e Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:52 +0100 Subject: [PATCH 5/7] Lex extended pound delimiters Start lexing `/.../`, and allow any number of pound signs to surround it. 
--- .../Regex/Parse/DelimiterLexing.swift | 152 +++++++++++++----- Tests/RegexTests/LexTests.swift | 25 +-- Tests/RegexTests/ParseTests.swift | 12 ++ 3 files changed, 143 insertions(+), 46 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift index e88c1fa80..fa6ca978a 100644 --- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift @@ -11,27 +11,27 @@ // TODO: mock up multi-line soon -enum Delimiter: Hashable, CaseIterable { - case traditional - case experimental - case reSingleQuote - case rxSingleQuote - - var openingAndClosing: (opening: String, closing: String) { - switch self { - case .traditional: return ("#/", "/#") - case .experimental: return ("#|", "|#") - case .reSingleQuote: return ("re'", "'") - case .rxSingleQuote: return ("rx'", "'") - } +struct Delimiter: Hashable { + let kind: Kind + let poundCount: Int + + init(_ kind: Kind, poundCount: Int) { + precondition(kind.allowsExtendedPoundSyntax || poundCount == 0) + self.kind = kind + self.poundCount = poundCount + } + + var opening: String { + String(repeating: "#", count: poundCount) + kind.opening + } + var closing: String { + kind.closing + String(repeating: "#", count: poundCount) } - var opening: String { openingAndClosing.opening } - var closing: String { openingAndClosing.closing } /// The default set of syntax options that the delimiter indicates. 
var defaultSyntaxOptions: SyntaxOptions { - switch self { - case .traditional, .reSingleQuote: + switch kind { + case .forwardSlash, .reSingleQuote: return .traditional case .experimental, .rxSingleQuote: return .experimental @@ -39,6 +39,37 @@ enum Delimiter: Hashable, CaseIterable { } } +extension Delimiter { + enum Kind: Hashable, CaseIterable { + case forwardSlash + case experimental + case reSingleQuote + case rxSingleQuote + + var openingAndClosing: (opening: String, closing: String) { + switch self { + case .forwardSlash: return ("/", "/") + case .experimental: return ("#|", "|#") + case .reSingleQuote: return ("re'", "'") + case .rxSingleQuote: return ("rx'", "'") + } + } + var opening: String { openingAndClosing.opening } + var closing: String { openingAndClosing.closing } + + /// Whether or not extended pound syntax e.g `##/.../##` is allowed with + /// this delimiter. + var allowsExtendedPoundSyntax: Bool { + switch self { + case .forwardSlash: + return true + case .experimental, .reSingleQuote, .rxSingleQuote: + return false + } + } + } +} + struct DelimiterLexError: Error, CustomStringConvertible { enum Kind: Hashable { case unterminated @@ -120,16 +151,25 @@ fileprivate struct DelimiterLexer { precondition(cursor <= end, "Cannot advance past end") } - /// Check to see if a UTF-8 sequence can be eaten from the current cursor. - func canEat(_ utf8: String.UTF8View) -> Bool { - guard let slice = slice(utf8.count) else { return false } - return slice.elementsEqual(utf8) + /// Check to see if a byte sequence can be eaten from the current cursor. + func canEat(_ bytes: C) -> Bool where C.Element == UInt8 { + guard let slice = slice(bytes.count) else { return false } + return slice.elementsEqual(bytes) + } + + /// Attempt to eat a byte sequence, returning `true` if successful. 
+ mutating func tryEat( + _ bytes: C + ) -> Bool where C.Element == UInt8 { + guard canEat(bytes) else { return false } + advanceCursor(bytes.count) + return true } - /// Attempt to eat a UTF-8 byte sequence, returning `true` if successful. - mutating func tryEat(_ utf8: String.UTF8View) -> Bool { - guard canEat(utf8) else { return false } - advanceCursor(utf8.count) + /// Attempt to eat an ascii scalar, returning `true` if successful. + mutating func tryEat(ascii s: Unicode.Scalar) -> Bool { + guard load() == ascii(s) else { return false } + advanceCursor() return true } @@ -137,8 +177,8 @@ fileprivate struct DelimiterLexer { /// the actual closing delimiter. mutating func trySkipDelimiter(_ delimiter: Delimiter) { // Only the closing `'` for re'...'/rx'...' can potentially be skipped over. - switch delimiter { - case .traditional, .experimental: + switch delimiter.kind { + case .forwardSlash, .experimental: return case .reSingleQuote, .rxSingleQuote: break @@ -272,16 +312,42 @@ fileprivate struct DelimiterLexer { } } + mutating func tryLexOpeningDelimiter(poundCount: Int) -> Delimiter? { + for kind in Delimiter.Kind.allCases { + // If the delimiter allows extended pound syntax, or there are no pounds, + // we just need to lex it. + let opening = kind.opening.utf8 + if kind.allowsExtendedPoundSyntax || poundCount == 0 { + guard tryEat(opening) else { continue } + return Delimiter(kind, poundCount: poundCount) + } + + // The delimiter doesn't allow extended pound syntax, so the pounds must be + // part of the delimiter. + guard + poundCount < opening.count, + opening.prefix(poundCount) + .elementsEqual(repeatElement(ascii("#"), count: poundCount)), + tryEat(opening.dropFirst(poundCount)) + else { continue } + + return Delimiter(kind, poundCount: 0) + } + return nil + } + /*consuming*/ mutating func lex( ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { + // We can consume any number of pound signs. 
+ var poundCount = 0 + while tryEat(ascii: "#") { + poundCount += 1 + } // Try to lex the opening delimiter. - guard let delimiter = Delimiter.allCases.first( - where: { tryEat($0.opening.utf8) } - ) else { + guard let delimiter = tryLexOpeningDelimiter(poundCount: poundCount) else { throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor()) } - let contentsStart = cursor while true { // Check to see if we're at a character that looks like a delimiter, but @@ -302,20 +368,34 @@ fileprivate struct DelimiterLexer { /// Drop a set of regex delimiters from the input string, returning the contents /// and the delimiter used. The input string must have valid delimiters. func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { - func stripDelimiter(_ delim: Delimiter) -> String? { + func stripDelimiter(_ kind: Delimiter.Kind) -> (String, Delimiter)? { + var slice = str.utf8[...] + + // Try strip any number of opening '#'s. + var poundCount = 0 + if kind.allowsExtendedPoundSyntax { + poundCount = slice.prefix(while: { + $0 == UInt8(("#" as UnicodeScalar).value) + }).count + slice = slice.dropFirst(poundCount) + } + // The opening delimiter must match. - guard var slice = str.utf8.tryDropPrefix(delim.opening.utf8) + guard var slice = slice.tryDropPrefix(kind.opening.utf8) else { return nil } // The closing delimiter may optionally match, as it may not be present in // invalid code. 
+ let delim = Delimiter(kind, poundCount: poundCount) if let newSlice = slice.tryDropSuffix(delim.closing.utf8) { slice = newSlice } - return String(slice) + let result = String(decoding: slice, as: UTF8.self) + precondition(result.utf8.elementsEqual(slice)) + return (result, delim) } - for d in Delimiter.allCases { - if let contents = stripDelimiter(d) { + for kind in Delimiter.Kind.allCases { + if let (contents, d) = stripDelimiter(kind) { return (contents, d) } } diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index c50191d05..d11be6c34 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -101,26 +101,31 @@ extension RegexTests { func testCompilerInterface() { + func delim(_ kind: Delimiter.Kind, poundCount: Int = 0) -> Delimiter { + Delimiter(kind, poundCount: poundCount) + } let testCases: [(String, (String, Delimiter)?)] = [ - ("#/abc/#", ("abc", .traditional)), - ("#|abc|#", ("abc", .experimental)), + ("/abc/", ("abc", delim(.forwardSlash))), + ("#/abc/#", ("abc", delim(.forwardSlash, poundCount: 1))), + ("###/abc/###", ("abc", delim(.forwardSlash, poundCount: 3))), + ("#|abc|#", ("abc", delim(.experimental))), // TODO: Null characters are lexically valid, similar to string literals, // but we ought to warn the user about them. 
- ("#|ab\0c|#", ("ab\0c", .experimental)), + ("#|ab\0c|#", ("ab\0c", delim(.experimental))), ("'abc'", nil), - ("#/abc/def/#", ("abc/def", .traditional)), - ("#|abc|def|#", ("abc|def", .experimental)), - ("#/abc\\/#def/#", ("abc\\/#def", .traditional)), - ("#|abc\\|#def|#", ("abc\\|#def", .experimental)), - ("#/abc|#def/#", ("abc|#def", .traditional)), - ("#|abc/#def|#", ("abc/#def", .experimental)), + ("#/abc/def/#", ("abc/def", delim(.forwardSlash, poundCount: 1))), + ("#|abc|def|#", ("abc|def", delim(.experimental))), + ("#/abc\\/#def/#", ("abc\\/#def", delim(.forwardSlash, poundCount: 1))), + ("#|abc\\|#def|#", ("abc\\|#def", delim(.experimental))), + ("#/abc|#def/#", ("abc|#def", delim(.forwardSlash, poundCount: 1))), + ("#|abc/#def|#", ("abc/#def", delim(.experimental))), ("#/abc|#def/", nil), ("#|abc/#def#", nil), ("#/abc\n/#", nil), ("#/abc\r/#", nil), - (#"re'abcre\''"#, (#"abcre\'"#, .reSingleQuote)), + (#"re'abcre\''"#, (#"abcre\'"#, delim(.reSingleQuote))), (#"re'\'"#, nil) ] diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index b185234a0..c4f13ffd9 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1736,7 +1736,9 @@ extension RegexTests { // MARK: Parse with delimiters + parseWithDelimitersTest("/a b/", concat("a", " ", "b")) parseWithDelimitersTest("#/a b/#", concat("a", " ", "b")) + parseWithDelimitersTest("##/a b/##", concat("a", " ", "b")) parseWithDelimitersTest("#|a b|#", concat("a", "b")) parseWithDelimitersTest("re'a b'", concat("a", " ", "b")) @@ -1773,6 +1775,11 @@ extension RegexTests { // Printable ASCII characters. delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##) + // Make sure we can handle a combining accent as first character. 
+ parseWithDelimitersTest("/\u{301}/", "\u{301}") + + delimiterLexingTest("/a/#", ignoreTrailing: true) + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter // if it's clear that it's part of the regex syntax. @@ -2302,6 +2309,11 @@ extension RegexTests { delimiterLexingDiagnosticTest("re'(?('abc'", .unterminated) delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .unterminated) delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .unterminated) + + // MARK: Unbalanced extended syntax + delimiterLexingDiagnosticTest("#/a/", .unterminated) + delimiterLexingDiagnosticTest("##/a/#", .unterminated) + } func testlibswiftDiagnostics() { From 9f42ea4ce07194030e63ec104438a0bf4d9e12bd Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:53 +0100 Subject: [PATCH 6/7] Introduce a multi-line literal mode When an extended delimiter `#/` is followed by a newline, enter a multi-line mode where the literal may span multiple lines, and extended syntax is enabled by default. 
--- .../Regex/Parse/DelimiterLexing.swift | 67 +++++++-- .../Regex/Parse/Diagnostics.swift | 4 + .../Regex/Parse/LexicalAnalysis.swift | 8 +- .../_RegexParser/Regex/Parse/Mocking.swift | 4 +- Sources/_RegexParser/Regex/Parse/Parse.swift | 46 +++++-- .../Regex/Parse/SyntaxOptions.swift | 5 + Tests/RegexTests/LexTests.swift | 5 + Tests/RegexTests/ParseTests.swift | 127 ++++++++++++++++++ 8 files changed, 239 insertions(+), 27 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift index fa6ca978a..a9f92ade3 100644 --- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift @@ -9,8 +9,6 @@ // //===----------------------------------------------------------------------===// -// TODO: mock up multi-line soon - struct Delimiter: Hashable { let kind: Kind let poundCount: Int @@ -28,13 +26,13 @@ struct Delimiter: Hashable { kind.closing + String(repeating: "#", count: poundCount) } - /// The default set of syntax options that the delimiter indicates. - var defaultSyntaxOptions: SyntaxOptions { + /// Whether or not multi-line mode is permitted. 
+ var allowsMultiline: Bool { switch kind { - case .forwardSlash, .reSingleQuote: - return .traditional - case .experimental, .rxSingleQuote: - return .experimental + case .forwardSlash: + return poundCount > 0 + case .experimental, .reSingleQuote, .rxSingleQuote: + return false } } } @@ -76,6 +74,7 @@ struct DelimiterLexError: Error, CustomStringConvertible { case invalidUTF8 // TODO: better range reporting case unknownDelimiter case unprintableASCII + case multilineClosingNotOnNewline } var kind: Kind @@ -94,6 +93,7 @@ struct DelimiterLexError: Error, CustomStringConvertible { case .invalidUTF8: return "invalid UTF-8 found in source file" case .unknownDelimiter: return "unknown regex literal delimiter" case .unprintableASCII: return "unprintable ASCII character found in source file" + case .multilineClosingNotOnNewline: return "closing delimiter must appear on new line" } } } @@ -103,6 +103,9 @@ fileprivate struct DelimiterLexer { var cursor: UnsafeRawPointer let end: UnsafeRawPointer + var firstNewline: UnsafeRawPointer? + var isMultiline: Bool { firstNewline != nil } + init(start: UnsafeRawPointer, end: UnsafeRawPointer) { precondition(start <= end) self.start = start @@ -262,12 +265,23 @@ fileprivate struct DelimiterLexer { let contentsEnd = cursor guard tryEat(delimiter.closing.utf8) else { return nil } - // Form a string from the contents and make sure it's valid UTF-8. let count = contentsEnd - contentsStart let contents = UnsafeRawBufferPointer( start: contentsStart, count: count) - let s = String(decoding: contents, as: UTF8.self) + // In multi-line mode, we must be on a new line. So scan backwards and make + // sure we only have whitespace until the newline. + if isMultiline { + let idx = contents.lastIndex( + where: { $0 == ascii("\n") || $0 == ascii("\r") })! 
+ 1 + guard contents[idx...].all({ $0 == ascii(" ") || $0 == ascii("\t") }) + else { + throw DelimiterLexError(.multilineClosingNotOnNewline, resumeAt: cursor) + } + } + + // Form a string from the contents and make sure it's valid UTF-8. + let s = String(decoding: contents, as: UTF8.self) guard s.utf8.elementsEqual(contents) else { throw DelimiterLexError(.invalidUTF8, resumeAt: cursor) } @@ -278,7 +292,10 @@ fileprivate struct DelimiterLexer { /// the end of the buffer is reached. mutating func advance(escaped: Bool = false) throws { guard let next = load() else { - throw DelimiterLexError(.unterminated, resumeAt: cursor) + // We've hit the end of the buffer. In multi-line mode, we don't want to + // skip over what is likely otherwise valid Swift code, so resume from the + // first newline. + throw DelimiterLexError(.unterminated, resumeAt: firstNewline ?? cursor) } switch UnicodeScalar(next) { case let next where !next.isASCII: @@ -289,7 +306,10 @@ fileprivate struct DelimiterLexer { advanceCursor() case "\n", "\r": - throw DelimiterLexError(.unterminated, resumeAt: cursor) + guard isMultiline else { + throw DelimiterLexError(.unterminated, resumeAt: cursor) + } + advanceCursor() case "\0": // TODO: Warn to match the behavior of String literal lexer? Or should @@ -301,8 +321,12 @@ fileprivate struct DelimiterLexer { advanceCursor() try advance(escaped: true) - case let next where !next.isPrintableASCII: + case let next + where !next.isPrintableASCII && !(isMultiline && next == "\t"): // Diagnose unprintable ASCII. + // Note that tabs are allowed in multi-line literals. + // TODO: This matches the string literal behavior, but should we allow + // tabs for single-line regex literals too? // TODO: Ideally we would recover and continue to lex until the ending // delimiter. 
throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor()) @@ -349,6 +373,23 @@ fileprivate struct DelimiterLexer { throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor()) } let contentsStart = cursor + + // If the delimiter allows multi-line, try skipping over any whitespace to a + // newline character. If we can do that, we enter multi-line mode. + if delimiter.allowsMultiline { + while let next = load() { + switch next { + case ascii(" "), ascii("\t"): + advanceCursor() + continue + case ascii("\n"), ascii("\r"): + firstNewline = cursor + default: + break + } + break + } + } while true { // Check to see if we're at a character that looks like a delimiter, but // likely isn't. In such a case, we can attempt to skip over it. diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index d4c809045..621d6ea11 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -70,6 +70,8 @@ enum ParseError: Error, Hashable { case cannotRemoveTextSegmentOptions case cannotRemoveSemanticsOptions + case cannotRemoveExtendedSyntaxInMultilineMode + case expectedCalloutArgument } @@ -158,6 +160,8 @@ extension ParseError: CustomStringConvertible { return "text segment mode cannot be unset, only changed" case .cannotRemoveSemanticsOptions: return "semantic level cannot be unset, only changed" + case .cannotRemoveExtendedSyntaxInMultilineMode: + return "extended syntax may not be disabled in multi-line mode" case .expectedCalloutArgument: return "expected argument to callout" } diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 165e97d1a..c48d53de9 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -657,6 +657,7 @@ extension Source { /// | MatchingOption* '-' 
MatchingOption* /// mutating func lexMatchingOptionSequence( + context: ParsingContext ) throws -> AST.MatchingOptionSequence? { let ateCaret = recordLoc { $0.tryEat("^") } @@ -691,6 +692,11 @@ extension Source { if opt.isSemanticMatchingLevel { throw ParseError.cannotRemoveSemanticsOptions } + // Extended syntax may not be removed if in multi-line mode. + if context.syntax.contains(.multilineExtendedSyntax) && + opt.isAnyExtended { + throw ParseError.cannotRemoveExtendedSyntaxInMultilineMode + } removing.append(opt) } return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location, @@ -864,7 +870,7 @@ extension Source { } // Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:). - if let seq = try src.lexMatchingOptionSequence() { + if let seq = try src.lexMatchingOptionSequence(context: context) { if src.tryEat(":") { return .changeMatchingOptions(seq, isIsolated: false) } diff --git a/Sources/_RegexParser/Regex/Parse/Mocking.swift b/Sources/_RegexParser/Regex/Parse/Mocking.swift index 596a59bf4..dd02e0fc7 100644 --- a/Sources/_RegexParser/Regex/Parse/Mocking.swift +++ b/Sources/_RegexParser/Regex/Parse/Mocking.swift @@ -62,8 +62,8 @@ func libswiftLexRegexLiteral( curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self) switch error.kind { - case .unterminated: - // Missing closing delimiter can be recovered from. + case .unterminated, .multilineClosingNotOnNewline: + // These can be recovered from. return false case .unprintableASCII, .invalidUTF8: // We don't currently have good recovery behavior for these. diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 2512f9bf2..c3aa3500b 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -288,22 +288,25 @@ extension Parser { ) throws -> AST.Group { context.recordGroup(kind.value) - // Check if we're introducing or removing extended syntax. 
+ // Check if we're introducing or removing extended syntax. We skip this for + // multi-line, as extended syntax is always enabled there. // TODO: PCRE differentiates between (?x) and (?xx) where only the latter // handles non-semantic whitespace in a custom character class. Other // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore, // treat (?x) and (?xx) as the same option here. If we ever get a strict // PCRE mode, we will need to change this to handle that. let currentSyntax = context.syntax - if case .changeMatchingOptions(let c, isIsolated: _) = kind.value { - if c.resetsCurrentOptions { - context.syntax.remove(.extendedSyntax) - } - if c.adding.contains(where: \.isAnyExtended) { - context.syntax.insert(.extendedSyntax) - } - if c.removing.contains(where: \.isAnyExtended) { - context.syntax.remove(.extendedSyntax) + if !context.syntax.contains(.multilineExtendedSyntax) { + if case .changeMatchingOptions(let c, isIsolated: _) = kind.value { + if c.resetsCurrentOptions { + context.syntax.remove(.extendedSyntax) + } + if c.adding.contains(where: \.isAnyExtended) { + context.syntax.insert(.extendedSyntax) + } + if c.removing.contains(where: \.isAnyExtended) { + context.syntax.remove(.extendedSyntax) + } } } defer { @@ -532,11 +535,32 @@ public func parse( return try parser.parse() } +/// Retrieve the default set of syntax options that a delimiter and literal +/// contents indicates. +fileprivate func defaultSyntaxOptions( + _ delim: Delimiter, contents: String +) -> SyntaxOptions { + switch delim.kind { + case .forwardSlash: + // For an extended syntax forward slash e.g #/.../#, extended syntax is + // permitted if it spans multiple lines. 
+ if delim.poundCount > 0 && + contents.unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) { + return .multilineExtendedSyntax + } + return .traditional + case .reSingleQuote: + return .traditional + case .experimental, .rxSingleQuote: + return .experimental + } +} + /// Parse a given regex string with delimiters, inferring the syntax options /// from the delimiter used. public func parseWithDelimiters( _ regex: S ) throws -> AST where S.SubSequence == Substring { let (contents, delim) = droppingRegexDelimiters(String(regex)) - return try parse(contents, delim.defaultSyntaxOptions) + return try parse(contents, defaultSyntaxOptions(delim, contents: contents)) } diff --git a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift index 5135d8ec1..b7c09ea1c 100644 --- a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift +++ b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift @@ -58,6 +58,11 @@ public struct SyntaxOptions: OptionSet { /// `(_: .*)` == `(?:.*)` public static var experimentalCaptures: Self { Self(1 << 5) } + /// The default syntax for a multi-line regex literal. + public static var multilineExtendedSyntax: Self { + return [Self(1 << 6), .extendedSyntax] + } + /* /// `*` == `[[:digit:]]*` == `\d*` diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index d11be6c34..5c304fe58 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -110,6 +110,11 @@ extension RegexTests { ("###/abc/###", ("abc", delim(.forwardSlash, poundCount: 3))), ("#|abc|#", ("abc", delim(.experimental))), + // Multiline + ("#/\na\nb\n/#", ("\na\nb\n", delim(.forwardSlash, poundCount: 1))), + ("#/ \na\nb\n /#", (" \na\nb\n ", delim(.forwardSlash, poundCount: 1))), + ("##/ \na\nb\n /##", (" \na\nb\n ", delim(.forwardSlash, poundCount: 2))), + // TODO: Null characters are lexically valid, similar to string literals, // but we ought to warn the user about them. 
("#|ab\0c|#", ("ab\0c", delim(.experimental))), diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index c4f13ffd9..c40cb86ca 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -223,6 +223,36 @@ func diagnosticTest( } } +func diagnosticWithDelimitersTest( + _ input: String, _ expected: ParseError, ignoreTrailing: Bool = false, + file: StaticString = #file, line: UInt = #line +) { + // First try lexing. + let literal = delimiterLexingTest( + input, ignoreTrailing: ignoreTrailing, file: file, line: line) + + do { + let orig = try parseWithDelimiters(literal) + let ast = orig.root + XCTFail(""" + + Passed \(ast) + But expected error: \(expected) + """, file: file, line: line) + } catch let e as Source.LocatedError { + guard e.error == expected else { + XCTFail(""" + + Expected: \(expected) + Actual: \(e.error) + """, file: file, line: line) + return + } + } catch let e { + XCTFail("Error without source location: \(e)", file: file, line: line) + } +} + func delimiterLexingDiagnosticTest( _ input: String, _ expected: DelimiterLexError.Kind, syntax: SyntaxOptions = .traditional, @@ -1403,6 +1433,18 @@ extension RegexTests { parseTest("(?xx) \\ ", changeMatchingOptions(matchingOptions( adding: .extraExtended), isIsolated: true, concat(" "))) + parseTest( + "(?x) a (?^) b", + changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + concat( + "a", + changeMatchingOptions( + unsetMatchingOptions(), isIsolated: true, concat(" ", "b")) + ) + ) + ) + // End of line comments aren't applicable in custom char classes. // TODO: ICU supports this. 
parseTest( @@ -1780,6 +1822,56 @@ extension RegexTests { delimiterLexingTest("/a/#", ignoreTrailing: true) + // MARK: Multiline + + parseWithDelimitersTest("#/\n/#", empty()) + parseWithDelimitersTest("#/\r/#", empty()) + parseWithDelimitersTest("#/\r\n/#", empty()) + parseWithDelimitersTest("#/\n\t\t /#", empty()) + parseWithDelimitersTest("#/ \t\t\n\t\t /#", empty()) + + parseWithDelimitersTest("#/\n a \n/#", "a") + parseWithDelimitersTest("#/\r a \r/#", "a") + parseWithDelimitersTest("#/\r\n a \r\n/#", "a") + parseWithDelimitersTest("#/\n a \n\t\t /#", "a") + parseWithDelimitersTest("#/\t \n a \n\t\t /#", "a") + + parseWithDelimitersTest(""" + #/ + a + b + c + /# + """, concat("a", "b", "c")) + + parseWithDelimitersTest(""" + #/ + a # comment + b # another + # + /# + """, concat("a", "b")) + + // Make sure (?^) is ignored. + parseWithDelimitersTest(""" + #/ + (?^) + # comment + /# + """, changeMatchingOptions( + unsetMatchingOptions(), isIsolated: true, empty()) + ) + + // (?x) has no effect. + parseWithDelimitersTest(""" + #/ + (?x) + # comment + /# + """, changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, empty()) + ) + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter // if it's clear that it's part of the regex syntax. @@ -2162,6 +2254,32 @@ extension RegexTests { diagnosticTest("(?-u)", .cannotRemoveSemanticsOptions) diagnosticTest("(?-b)", .cannotRemoveSemanticsOptions) + // Extended syntax may not be removed in multi-line mode. 
+ diagnosticWithDelimitersTest(""" + #/ + (?-x)a b + /# + """, .cannotRemoveExtendedSyntaxInMultilineMode + ) + diagnosticWithDelimitersTest(""" + #/ + (?-xx)a b + /# + """, .cannotRemoveExtendedSyntaxInMultilineMode + ) + diagnosticWithDelimitersTest(""" + #/ + (?-x:a b) + /# + """, .cannotRemoveExtendedSyntaxInMultilineMode + ) + diagnosticWithDelimitersTest(""" + #/ + (?-xx:a b) + /# + """, .cannotRemoveExtendedSyntaxInMultilineMode + ) + // MARK: Group specifiers diagnosticTest(#"(*"#, .unknownGroupKind("*")) @@ -2314,6 +2432,15 @@ extension RegexTests { delimiterLexingDiagnosticTest("#/a/", .unterminated) delimiterLexingDiagnosticTest("##/a/#", .unterminated) + // MARK: Multiline + + // Can only be done if pound signs are used. + delimiterLexingDiagnosticTest("/\n/", .unterminated) + + // Opening and closing delimiters must be on a newline. + delimiterLexingDiagnosticTest("#/a\n/#", .unterminated) + delimiterLexingDiagnosticTest("#/\na/#", .multilineClosingNotOnNewline) + delimiterLexingDiagnosticTest("#/\n#/#", .multilineClosingNotOnNewline) } func testlibswiftDiagnostics() { From 556bca0abb2bd1623664481f9aa31be0ed19af1f Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:53 +0100 Subject: [PATCH 7/7] Disable unused delimiters Leave only `/.../` (and its extended syntax) enabled for now. --- .../Regex/Parse/DelimiterLexing.swift | 21 ++++++++++++---- Tests/RegexTests/LexTests.swift | 24 +++++++++++++++++-- Tests/RegexTests/ParseTests.swift | 6 +++-- 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift index a9f92ade3..bee782043 100644 --- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift @@ -35,6 +35,12 @@ struct Delimiter: Hashable { return false } } + + /// The delimiters which are currently enabled. 
+ static var enabledDelimiters: [Kind] { [.forwardSlash] } + + /// All known delimiters. + static var allDelimiters: [Kind] { Kind.allCases } } extension Delimiter { @@ -106,11 +112,15 @@ fileprivate struct DelimiterLexer { var firstNewline: UnsafeRawPointer? var isMultiline: Bool { firstNewline != nil } - init(start: UnsafeRawPointer, end: UnsafeRawPointer) { + let delimiters: [Delimiter.Kind] + + init(start: UnsafeRawPointer, end: UnsafeRawPointer, + delimiters: [Delimiter.Kind]) { precondition(start <= end) self.start = start self.cursor = start self.end = end + self.delimiters = delimiters } func ascii(_ s: Unicode.Scalar) -> UInt8 { @@ -337,7 +347,7 @@ fileprivate struct DelimiterLexer { } mutating func tryLexOpeningDelimiter(poundCount: Int) -> Delimiter? { - for kind in Delimiter.Kind.allCases { + for kind in delimiters { // If the delimiter allows extended pound syntax, or there are no pounds, // we just need to lex it. let opening = kind.opening.utf8 @@ -435,7 +445,7 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { precondition(result.utf8.elementsEqual(slice)) return (result, delim) } - for kind in Delimiter.Kind.allCases { + for kind in Delimiter.allDelimiters { if let (contents, d) = stripDelimiter(kind) { return (contents, d) } @@ -446,8 +456,9 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { /// Attempt to lex a regex literal between `start` and `end`, returning either /// the contents and pointer from which to resume lexing, or an error. 
func lexRegex( - start: UnsafeRawPointer, end: UnsafeRawPointer + start: UnsafeRawPointer, end: UnsafeRawPointer, + delimiters: [Delimiter.Kind] = Delimiter.enabledDelimiters ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { - var lexer = DelimiterLexer(start: start, end: end) + var lexer = DelimiterLexer(start: start, end: end, delimiters: delimiters) return try lexer.lex() } diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index 5c304fe58..958c53c26 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -100,7 +100,7 @@ extension RegexTests { } - func testCompilerInterface() { + func testCompilerInterface() throws { func delim(_ kind: Delimiter.Kind, poundCount: Int = 0) -> Delimiter { Delimiter(kind, poundCount: poundCount) } @@ -138,7 +138,9 @@ extension RegexTests { input.withCString { let endPtr = $0 + input.utf8.count assert(endPtr.pointee == 0) - guard let out = try? lexRegex(start: $0, end: endPtr) else { + guard let out = try? lexRegex( + start: $0, end: endPtr, delimiters: Delimiter.allDelimiters) + else { XCTAssertNil(expected) return } @@ -150,5 +152,23 @@ extension RegexTests { XCTAssertEqual(expected?.1, droppedDelimiters.1) } } + + // TODO: Remove the lexing code for these if we no longer need them. 
+ let disabledDelimiters: [String] = [ + "#|x|#", "re'x'", "rx'y'" + ] + + for input in disabledDelimiters { + try input.withCString { + let endPtr = $0 + input.utf8.count + assert(endPtr.pointee == 0) + do { + _ = try lexRegex(start: $0, end: endPtr) + XCTFail() + } catch let e as DelimiterLexError { + XCTAssertEqual(e.kind, .unknownDelimiter) + } + } + } } } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index c40cb86ca..c6ff3e46d 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -117,7 +117,8 @@ func delimiterLexingTest( ) -> String { input.withCString(encodedAs: UTF8.self) { ptr in let endPtr = ptr + input.utf8.count - let (contents, delim, end) = try! lexRegex(start: ptr, end: endPtr) + let (contents, delim, end) = try! lexRegex( + start: ptr, end: endPtr, delimiters: Delimiter.allDelimiters) if ignoreTrailing { XCTAssertNotEqual(end, endPtr, file: file, line: line) } else { @@ -260,7 +261,8 @@ func delimiterLexingDiagnosticTest( ) { do { _ = try input.withCString { ptr in - try lexRegex(start: ptr, end: ptr + input.count) + try lexRegex( + start: ptr, end: ptr + input.count, delimiters: Delimiter.allDelimiters) } XCTFail(""" Passed, but expected error: \(expected)