From 57a50926abe9698ce58428ae89e830a2ad6f68b9 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 26 May 2022 17:49:27 +0100 Subject: [PATCH] Allow unbounded quoted sequences `\Q...` PCRE and ICU both support quoted sequences that don't have a terminating `\E`. Update the parsing to allow this. Additionally, allow empty quoted sequences outside of custom character classes, which is consistent with ICU. Finally, don't allow quoted sequences to span multiple lines in extended syntax literals. --- .../Regex/Parse/Diagnostics.swift | 4 +++ .../Regex/Parse/LexicalAnalysis.swift | 19 +++++++++-- Sources/_RegexParser/Regex/Parse/Parse.swift | 10 ++++-- Tests/RegexTests/ParseTests.swift | 33 +++++++++++++++++-- 4 files changed, 60 insertions(+), 6 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index 479604582..ee17f209f 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -44,6 +44,8 @@ enum ParseError: Error, Hashable { case invalidEscape(Character) case confusableCharacter(Character) + case quoteMayNotSpanMultipleLines + case cannotReferToWholePattern case quantifierRequiresOperand(String) @@ -138,6 +140,8 @@ extension ParseError: CustomStringConvertible { return "invalid escape sequence '\\\(c)'" case .confusableCharacter(let c): return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead" + case .quoteMayNotSpanMultipleLines: + return "quoted sequence may not span multiple lines in multi-line literal" case .cannotReferToWholePattern: return "cannot refer to whole pattern here" case .quantifierRequiresOperand(let q): diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index a6dfa0ce9..bdf076e68 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -579,7 +579,7 @@ extension Source { /// Try to consume quoted content /// - /// Quote -> '\Q' (!'\E' .)* '\E' + /// Quote -> '\Q' (!'\E' .)* '\E'? /// /// With `SyntaxOptions.experimentalQuotes`, also accepts /// @@ -592,9 +592,24 @@ extension Source { mutating func lexQuote(context: ParsingContext) throws -> AST.Quote? { let str = try recordLoc { src -> String? in if src.tryEat(sequence: #"\Q"#) { - return try src.expectQuoted(endingWith: #"\E"#).value + let contents = src.lexUntil { src in + src.isEmpty || src.tryEat(sequence: #"\E"#) + }.value + + // In multi-line literals, the quote may not span multiple lines. + if context.syntax.contains(.multilineExtendedSyntax), + contents.spansMultipleLinesInRegexLiteral { + throw ParseError.quoteMayNotSpanMultipleLines + } + + // The sequence must not be empty in a custom character class. + if context.isInCustomCharacterClass && contents.isEmpty { + throw ParseError.expectedNonEmptyContents + } + return contents } if context.experimentalQuotes, src.tryEat("\"") { + // TODO: Can experimental quotes be empty? return try src.expectQuoted(endingWith: "\"", ignoreEscaped: true).value } return nil diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 84957220c..e540e6c1d 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -592,6 +592,13 @@ public func parse( return ast } +extension String { + /// Whether the given string is considered multi-line for a regex literal. + var spansMultipleLinesInRegexLiteral: Bool { + unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) + } +} + /// Retrieve the default set of syntax options that a delimiter and literal /// contents indicates. fileprivate func defaultSyntaxOptions( @@ -601,8 +608,7 @@ fileprivate func defaultSyntaxOptions( case .forwardSlash: // For an extended syntax forward slash e.g #/.../#, extended syntax is // permitted if it spans multiple lines. - if delim.poundCount > 0 && - contents.unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) { + if delim.poundCount > 0 && contents.spansMultipleLinesInRegexLiteral { return .multilineExtendedSyntax } return .traditional diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index dbdacb0c2..fbca83128 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -754,6 +754,14 @@ extension RegexTests { // This follows the PCRE behavior. parseTest(#"\Q\\E"#, quote("\\")) + // ICU allows quotes to be empty outside of custom character classes. + parseTest(#"\Q\E"#, quote("")) + + // Quotes may be unterminated. + parseTest(#"\Qab"#, quote("ab")) + parseTest(#"\Q"#, quote("")) + parseTest("\\Qab\\", quote("ab\\")) + parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"), syntax: .experimental) parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")), @@ -2539,8 +2547,6 @@ extension RegexTests { diagnosticTest(#"(?P"#, .expected(")")) diagnosticTest(#"(?R"#, .expected(")")) - diagnosticTest(#"\Qab"#, .expected("\\E")) - diagnosticTest("\\Qab\\", .expected("\\E")) diagnosticTest(#""ab"#, .expected("\""), syntax: .experimental) diagnosticTest(#""ab\""#, .expected("\""), syntax: .experimental) diagnosticTest("\"ab\\", .expectedEscape, syntax: .experimental) @@ -2619,6 +2625,9 @@ extension RegexTests { // TODO: Custom diagnostic for missing '\Q' diagnosticTest(#"\E"#, .invalidEscape("E")) + diagnosticTest(#"[\Q\E]"#, .expectedNonEmptyContents) + diagnosticTest(#"[\Q]"#, .expected("]")) + // PCRE treats these as octal, but we require a `0` prefix. diagnosticTest(#"[\1]"#, .invalidEscape("1")) diagnosticTest(#"[\123]"#, .invalidEscape("1")) @@ -2711,6 +2720,26 @@ extension RegexTests { """, .cannotRemoveExtendedSyntaxInMultilineMode ) + diagnosticWithDelimitersTest(#""" + #/ + \Q + \E + /# + """#, .quoteMayNotSpanMultipleLines) + + diagnosticWithDelimitersTest(#""" + #/ + \Qabc + \E + /# + """#, .quoteMayNotSpanMultipleLines) + + diagnosticWithDelimitersTest(#""" + #/ + \Q + /# + """#, .quoteMayNotSpanMultipleLines) + // MARK: Group specifiers diagnosticTest(#"(*"#, .unknownGroupKind("*"))