From 06dbc16a333d88fc50febfd98ad9c65c8f57be3d Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:41 +0100 Subject: [PATCH 1/9] Introduce Source.lookahead Use this to replace the various places we're doing `var src = self`. --- .../Regex/Parse/LexicalAnalysis.swift | 86 +++++++++++-------- 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index cbc9d5eb2..936e6b189 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -149,6 +149,14 @@ extension Source { return result } + /// Perform a lookahead using a temporary source. Within the body of the + /// lookahead, any modifications to the source will not be reflected outside + /// the body. + func lookahead(_ body: (inout Source) throws -> T) rethrows -> T { + var src = self + return try body(&src) + } + /// Attempt to eat the given character, returning its source location if /// successful, `nil` otherwise. mutating func tryEatWithLoc(_ c: Character) -> SourceLocation? { @@ -1240,8 +1248,9 @@ extension Source { private func canLexPOSIXCharacterProperty() -> Bool { do { - var src = self - return try src.lexPOSIXCharacterProperty() != nil + return try lookahead { src in + try src.lexPOSIXCharacterProperty() != nil + } } catch { // We want to tend on the side of lexing a POSIX character property, so // even if it is invalid in some way (e.g invalid property names), still @@ -1394,10 +1403,11 @@ extension Source { /// Checks whether a numbered reference can be lexed. private func canLexNumberedReference() -> Bool { - var src = self - _ = src.tryEat(anyOf: "+", "-") - guard let next = src.peek() else { return false } - return RadixKind.decimal.characterFilter(next) + lookahead { src in + _ = src.tryEat(anyOf: "+", "-") + guard let next = src.peek() else { return false } + return RadixKind.decimal.characterFilter(next) + } } /// Eat a named reference up to a given closing delimiter. @@ -1587,53 +1597,55 @@ extension Source { /// Whether we can lex a group-like reference after the specifier '(?'. private func canLexGroupLikeReference() -> Bool { - var src = self - if src.tryEat("P") { - return src.tryEat(anyOf: "=", ">") != nil - } - if src.tryEat(anyOf: "&", "R") != nil { - return true + lookahead { src in + if src.tryEat("P") { + return src.tryEat(anyOf: "=", ">") != nil + } + if src.tryEat(anyOf: "&", "R") != nil { + return true + } + return src.canLexNumberedReference() } - return src.canLexNumberedReference() } private func canLexMatchingOptionsAsAtom(context: ParsingContext) -> Bool { - var src = self - - // See if we can lex a matching option sequence that terminates in ')'. Such - // a sequence is an atom. If an error is thrown, there are invalid elements - // of the matching option sequence. In such a case, we can lex as a group - // and diagnose the invalid group kind. - guard (try? src.lexMatchingOptionSequence(context: context)) != nil else { - return false + lookahead { src in + // See if we can lex a matching option sequence that terminates in ')'. + // Such a sequence is an atom. If an error is thrown, there are invalid + // elements of the matching option sequence. In such a case, we can lex as + // a group and diagnose the invalid group kind. + guard (try? 
src.lexMatchingOptionSequence(context: context)) != nil else { + return false + } + return src.tryEat(")") } - return src.tryEat(")") } /// Whether a group specifier should be lexed as an atom instead of a group. private func shouldLexGroupLikeAtom(context: ParsingContext) -> Bool { - var src = self - guard src.tryEat("(") else { return false } + lookahead { src in + guard src.tryEat("(") else { return false } - if src.tryEat("?") { - // The start of a reference '(?P=', '(?R', ... - if src.canLexGroupLikeReference() { return true } + if src.tryEat("?") { + // The start of a reference '(?P=', '(?R', ... + if src.canLexGroupLikeReference() { return true } - // The start of a PCRE callout. - if src.tryEat("C") { return true } + // The start of a PCRE callout. + if src.tryEat("C") { return true } - // The start of an Oniguruma 'of-contents' callout. - if src.tryEat("{") { return true } + // The start of an Oniguruma 'of-contents' callout. + if src.tryEat("{") { return true } - // A matching option atom (?x), (?i), ... - if src.canLexMatchingOptionsAsAtom(context: context) { return true } + // A matching option atom (?x), (?i), ... + if src.canLexMatchingOptionsAsAtom(context: context) { return true } + + return false + } + // The start of a backreference directive or Oniguruma named callout. + if src.tryEat("*") { return true } return false } - // The start of a backreference directive or Oniguruma named callout. - if src.tryEat("*") { return true } - - return false } /// Consume an escaped atom, starting from after the backslash From 8242df6f15b544af526f3aa3872f46a619e39a3c Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:42 +0100 Subject: [PATCH 2/9] Remove `throws` from a couple of lexing methods --- .../Regex/Parse/LexicalAnalysis.swift | 15 +++++---------- Sources/_RegexParser/Regex/Parse/Parse.swift | 6 +++--- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 936e6b189..0d4928406 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -421,9 +421,7 @@ extension Source { ) throws -> (Located, Located, [AST.Trivia])? { var trivia: [AST.Trivia] = [] - if let t = try lexNonSemanticWhitespace(context: context) { - trivia.append(t) - } + if let t = lexNonSemanticWhitespace(context: context) { trivia.append(t) } let amt: Located? = try recordLoc { src in if src.tryEat("*") { return .zeroOrMore } @@ -441,9 +439,7 @@ extension Source { guard let amt = amt else { return nil } // PCRE allows non-semantic whitespace here in extended syntax mode. - if let t = try lexNonSemanticWhitespace(context: context) { - trivia.append(t) - } + if let t = lexNonSemanticWhitespace(context: context) { trivia.append(t) } let kind: Located = recordLoc { src in if src.tryEat("?") { return .reluctant } @@ -675,7 +671,7 @@ extension Source { /// Does nothing unless `SyntaxOptions.nonSemanticWhitespace` is set mutating func lexNonSemanticWhitespace( context: ParsingContext - ) throws -> AST.Trivia? { + ) -> AST.Trivia? 
{ guard context.ignoreWhitespace else { return nil } // FIXME: PCRE only treats space and tab characters as whitespace when @@ -707,7 +703,7 @@ extension Source { if let comment = try lexComment(context: context) { return comment } - if let whitespace = try lexNonSemanticWhitespace(context: context) { + if let whitespace = lexNonSemanticWhitespace(context: context) { return whitespace } return nil @@ -1186,8 +1182,7 @@ extension Source { } } - mutating func lexCustomCCStart( - ) throws -> Located? { + mutating func lexCustomCCStart() -> Located? { recordLoc { src in // Make sure we don't have a POSIX character property. This may require // walking to its ending to make sure we have a closing ':]', as otherwise diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index a3ddfb812..c31380c9d 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -429,7 +429,7 @@ extension Parser { } // Check if we have the start of a custom character class '['. - if let cccStart = try source.lexCustomCCStart() { + if let cccStart = source.lexCustomCCStart() { return .customCharacterClass( try parseCustomCharacterClass(cccStart)) } @@ -513,7 +513,7 @@ extension Parser { source.peekCCBinOp() == nil { // Nested custom character class. - if let cccStart = try source.lexCustomCCStart() { + if let cccStart = source.lexCustomCCStart() { members.append(.custom(try parseCustomCharacterClass(cccStart))) continue } @@ -527,7 +527,7 @@ extension Parser { // Lex non-semantic whitespace if we're allowed. // TODO: ICU allows end-of-line comments in custom character classes, // which we ought to support if we want to support multi-line regex. - if let trivia = try source.lexNonSemanticWhitespace(context: context) { + if let trivia = source.lexNonSemanticWhitespace(context: context) { members.append(.trivia(trivia)) continue } From e80322bfdd92c6d047e8b83ac7f5435e61e8e6c9 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:42 +0100 Subject: [PATCH 3/9] Add ASTBuilder helper for char class set operations --- .../Utility/ASTBuilder.swift | 8 ++++ Tests/RegexTests/ParseTests.swift | 38 +++++++++---------- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index 78477e2b5..1001d23f9 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -319,6 +319,14 @@ func charClass( return .custom(cc) } +func setOp( + _ lhs: AST.CustomCharacterClass.Member..., + op: AST.CustomCharacterClass.SetOp, + _ rhs: AST.CustomCharacterClass.Member... 
+) -> AST.CustomCharacterClass.Member { + .setOperation(lhs, .init(faking: op), rhs) +} + func quote(_ s: String) -> AST.Node { .quote(.init(s, .fake)) } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index e36d84f60..d9f57269b 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -522,9 +522,8 @@ extension RegexTests { parseTest("[a-a]", charClass(range_m("a", "a"))) parseTest("[B-a]", charClass(range_m("B", "a"))) - // FIXME: AST builder helpers for custom char class types parseTest("[a-d--a-c]", charClass( - .setOperation([range_m("a", "d")], .init(faking: .subtraction), [range_m("a", "c")]) + setOp(range_m("a", "d"), op: .subtraction, range_m("a", "c")) )) parseTest("[-]", charClass("-")) @@ -684,30 +683,31 @@ extension RegexTests { parseTest( #"[a[bc]de&&[^bc]\d]+"#, - oneOrMore(of: charClass( - .setOperation( - ["a", charClass("b", "c"), "d", "e"], - .init(faking: .intersection), - [charClass("b", "c", inverted: true), atom_m(.escaped(.decimalDigit))] - )))) + oneOrMore(of: charClass(setOp( + "a", charClass("b", "c"), "d", "e", + op: .intersection, + charClass("b", "c", inverted: true), atom_m(.escaped(.decimalDigit)) + ))) + ) parseTest( - "[a&&b]", - charClass( - .setOperation(["a"], .init(faking: .intersection), ["b"]))) + "[a&&b]", charClass(setOp("a", op: .intersection, "b")) + ) parseTest( "[abc--def]", - charClass(.setOperation(["a", "b", "c"], .init(faking: .subtraction), ["d", "e", "f"]))) + charClass(setOp("a", "b", "c", op: .subtraction, "d", "e", "f")) + ) // We left-associate for chained operators. parseTest( "[ab&&b~~cd]", - charClass( - .setOperation( - [.setOperation(["a", "b"], .init(faking: .intersection), ["b"])], - .init(faking: .symmetricDifference), - ["c", "d"]))) + charClass(setOp( + setOp("a", "b", op: .intersection, "b"), + op: .symmetricDifference, + "c", "d" + )) + ) // Operators are only valid in custom character classes. parseTest( @@ -723,11 +723,11 @@ extension RegexTests { parseTest( "[ && ]", - charClass(.setOperation([" "], .init(faking: .intersection), [" ", " "])) + charClass(setOp(" ", op: .intersection, " ", " ")) ) parseTest("(?x)[ a && b ]", concat( changeMatchingOptions(matchingOptions(adding: .extended)), - charClass(.setOperation(["a"], .init(faking: .intersection), ["b"])) + charClass(setOp("a", op: .intersection, "b")) )) // MARK: Quotes From 1e57c5a11a2193879bd1268fcf479b3c67522506 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:43 +0100 Subject: [PATCH 4/9] Simplify character class parsing a little We don't have to handle bailing early, the loop will terminate if we don't lex another operator. 
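
Concretely, the operator loop now just folds each parsed operation back
into `members` and relies on its own condition to stop; sketched from the
hunk below (the surrounding loop and the `binOp`/`rhs` bindings are
unchanged and not shown here):

    // Fold the accumulated members and the freshly parsed RHS into a
    // single set-operation member. When no further operator is lexed,
    // the loop ends and the closing ']' is consumed once on the path
    // after the loop.
    members = [.setOperation(members, binOp, rhs)]

So an input like `[ab&&b~~cd]` still left-associates into nested set
operations, just without the separate early-exit path for ']'.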
--- Sources/_RegexParser/Regex/Parse/Parse.swift | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index c31380c9d..1cdb2c84e 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -485,16 +485,7 @@ extension Parser { throw Source.LocatedError( ParseError.expectedCustomCharacterClassMembers, start.location) } - - // If we're done, bail early - let setOp = Member.setOperation(members, binOp, rhs) - if source.tryEat("]") { - return CustomCC( - start, [setOp], loc(start.location.start)) - } - - // Otherwise it's just another member to accumulate - members = [setOp] + members = [.setOperation(members, binOp, rhs)] } if members.none(\.isSemantic) { throw Source.LocatedError( From 95dc4873cd55982ce42edb51d159a0a88bb19cc4 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:43 +0100 Subject: [PATCH 5/9] Dump the inverted bit of a custom character class Make sure an inverted character class does not dump the same as a regular character class. --- Sources/_RegexParser/Regex/Printing/DumpAST.swift | 2 +- Tests/RegexTests/ParseTests.swift | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 110b7b480..10e50d712 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -309,7 +309,7 @@ extension AST.CustomCharacterClass: _ASTNode { // comparisons of dumped output in tests. // TODO: We should eventually have some way of filtering out trivia for // tests, so that it can appear in regular dumps. - return "customCharacterClass(\(strippingTriviaShallow.members))" + return "customCharacterClass(inverted: \(isInverted), \(strippingTriviaShallow.members))" } } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index d9f57269b..1b8375b14 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2210,6 +2210,8 @@ extension RegexTests { parseNotEqualTest(#"[abc]"#, #"[a b c]"#) + parseNotEqualTest("[abc]", "[^abc]") + parseNotEqualTest(#"\1"#, #"\10"#) parseNotEqualTest("(?^:)", ("(?-:)")) From 9d84967ffec8d11561aab96ac86e1f7e7acf6b94 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:43 +0100 Subject: [PATCH 6/9] Allow empty comments `expectQuoted` expects non-empty contents, which doesn't apply to comments. --- Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift | 4 ++-- Tests/RegexTests/ParseTests.swift | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 0d4928406..d13edde57 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -630,10 +630,10 @@ extension Source { mutating func lexComment(context: ParsingContext) throws -> AST.Trivia? { let trivia: Located? 
= try recordLoc { src in if src.tryEat(sequence: "(?#") { - return try src.expectQuoted(endingWith: ")").value + return try src.lexUntil(eating: ")").value } if context.experimentalComments, src.tryEat(sequence: "/*") { - return try src.expectQuoted(endingWith: "*/").value + return try src.lexUntil(eating: "*/").value } if context.endOfLineComments, src.tryEat("#") { // Try eat until we either exhaust the input, or hit a newline. Note diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 1b8375b14..997f9530b 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1648,6 +1648,9 @@ extension RegexTests { parseTest("[(?#abc)]", charClass("(", "?", "#", "a", "b", "c", ")")) parseTest("# abc", concat("#", " ", "a", "b", "c")) + parseTest("(?#)", empty()) + parseTest("/**/", empty(), syntax: .experimental) + // MARK: Matching option changing parseTest( From 24b64cd3c8d08167b84879a8a6a2110d029e5f4b Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:44 +0100 Subject: [PATCH 7/9] Lex whitespace in range quantifiers PCRE does not allow whitespace here, instead treating the sequence as literal if whitespace is present. However this behavior is quite unintuitive. Instead, lex whitespace between range operands. --- .../Regex/Parse/LexicalAnalysis.swift | 14 ++++++++++++-- Tests/RegexTests/MatchTests.swift | 6 ++++++ Tests/RegexTests/ParseTests.swift | 16 +++++++++------- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index d13edde57..f1038da57 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -430,7 +430,7 @@ extension Source { return try src.tryEating { src in guard src.tryEat("{"), - let range = try src.lexRange(context: context), + let range = try src.lexRange(context: context, trivia: &trivia), src.tryEat("}") else { return nil } return range.value @@ -456,11 +456,17 @@ extension Source { /// | ExpRange /// ExpRange -> '..<' | '...' /// | '..<' | '...' ? - mutating func lexRange(context: ParsingContext) throws -> Located? { + mutating func lexRange( + context: ParsingContext, trivia: inout [AST.Trivia] + ) throws -> Located? { try recordLoc { src in try src.tryEating { src in + if let t = src.lexWhitespace() { trivia.append(t) } + let lowerOpt = try src.lexNumber() + if let t = src.lexWhitespace() { trivia.append(t) } + // ',' or '...' or '..<' or nothing // TODO: We ought to try and consume whitespace here and emit a // diagnostic for the user warning them that it would cause the range to @@ -480,11 +486,15 @@ extension Source { closedRange = nil } + if let t = src.lexWhitespace() { trivia.append(t) } + let upperOpt = try src.lexNumber()?.map { upper in // If we have an open range, the upper bound should be adjusted down. closedRange == true ? 
upper : upper - 1 } + if let t = src.lexWhitespace() { trivia.append(t) } + switch (lowerOpt, closedRange, upperOpt) { case let (l?, nil, nil): return .exactly(l) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 4ae489300..02a1c6c49 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -341,8 +341,12 @@ extension RegexTests { firstMatchTest( #"a{1,2}"#, input: "123aaaxyz", match: "aa") + firstMatchTest( + #"a{ 1 , 2 }"#, input: "123aaaxyz", match: "aa") firstMatchTest( #"a{,2}"#, input: "123aaaxyz", match: "") + firstMatchTest( + #"a{ , 2 }"#, input: "123aaaxyz", match: "") firstMatchTest( #"a{,2}x"#, input: "123aaaxyz", match: "aax") firstMatchTest( @@ -351,6 +355,8 @@ extension RegexTests { #"a{2,}"#, input: "123aaaxyz", match: "aaa") firstMatchTest( #"a{1}"#, input: "123aaaxyz", match: "a") + firstMatchTest( + #"a{ 1 }"#, input: "123aaaxyz", match: "a") firstMatchTest( #"a{1,2}?"#, input: "123aaaxyz", match: "a") firstMatchTest( diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 997f9530b..5d75eb361 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -832,6 +832,10 @@ extension RegexTests { #"a{1,1}"#, quantRange(1...1, of: "a")) + parseTest("x{3, 5}", quantRange(3 ... 5, of: "x")) + parseTest("x{ 3 , 5 }", quantRange(3 ... 5, of: "x")) + parseTest("x{3 }", exactly(3, of: "x")) + // Make sure ranges get treated as literal if invalid. parseTest("{", "{") parseTest("{,", concat("{", ",")) @@ -851,11 +855,6 @@ extension RegexTests { parseTest("x{+", concat("x", oneOrMore(of: "{"))) parseTest("x{6,+", concat("x", "{", "6", oneOrMore(of: ","))) - // TODO: We should emit a diagnostic for this. - parseTest("x{3, 5}", concat("x", "{", "3", ",", " ", "5", "}")) - parseTest("{3, 5}", concat("{", "3", ",", " ", "5", "}")) - parseTest("{3 }", concat("{", "3", " ", "}")) - // MARK: Groups // Named captures @@ -1771,10 +1770,10 @@ extension RegexTests { // PCRE states that whitespace won't be ignored within a range. // http://pcre.org/current/doc/html/pcre2api.html#SEC20 - // TODO: We ought to warn on this, and produce a range anyway. + // We however do ignore it. parseTest("(?x)a{1, 3}", concat( changeMatchingOptions(matchingOptions(adding: .extended)), - "a", "{", "1", ",", "3", "}" + quantRange(1 ... 3, of: "a") )) // Test that we cover the list of whitespace characters covered by PCRE. @@ -2695,6 +2694,9 @@ extension RegexTests { diagnosticTest("{1,3}", .quantifierRequiresOperand("{1,3}")) diagnosticTest("a{3,2}", .invalidQuantifierRange(3, 2)) + diagnosticTest("{3, 5}", .quantifierRequiresOperand("{3, 5}")) + diagnosticTest("{3 }", .quantifierRequiresOperand("{3 }")) + // These are not quantifiable. diagnosticTest(#"\b?"#, .notQuantifiable) diagnosticTest(#"\B*"#, .notQuantifiable) From 8388d0f22f9270707df46684492c2ac5f36c9a1c Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:44 +0100 Subject: [PATCH 8/9] Parse end-of-line comments in custom character classes Previously we would only parse non-semantic whitespace, but also expand to end-of-line comments, which are supported by ICU. 
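
For illustration (this mirrors the tests added below), with extended
syntax enabled — e.g. in a multi-line regex literal — a '#' inside a
custom character class now begins an end-of-line comment instead of
contributing a literal '#':

    #/
    [
      a # interesting
      b-c #a
      d]
    /#

parses as the class containing 'a', the range 'b'-'c', and 'd', with the
comments lexed as trivia. Outside extended syntax, '[#]' still parses as
a class containing a literal '#', and '(?#...)' is no longer treated as a
comment inside a character class.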
--- .../Regex/Parse/LexicalAnalysis.swift | 2 +- Sources/_RegexParser/Regex/Parse/Parse.swift | 6 ++--- Tests/RegexTests/ParseTests.swift | 23 ++++++++++++++----- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index f1038da57..ff72fca01 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -639,7 +639,7 @@ extension Source { /// mutating func lexComment(context: ParsingContext) throws -> AST.Trivia? { let trivia: Located? = try recordLoc { src in - if src.tryEat(sequence: "(?#") { + if !context.isInCustomCharacterClass && src.tryEat(sequence: "(?#") { return try src.lexUntil(eating: ")").value } if context.experimentalComments, src.tryEat(sequence: "/*") { diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 1cdb2c84e..3f5bfd69f 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -515,10 +515,8 @@ extension Parser { continue } - // Lex non-semantic whitespace if we're allowed. - // TODO: ICU allows end-of-line comments in custom character classes, - // which we ought to support if we want to support multi-line regex. - if let trivia = source.lexNonSemanticWhitespace(context: context) { + // Lex trivia if we're allowed. + if let trivia = try source.lexTrivia(context: context) { members.append(.trivia(trivia)) continue } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 5d75eb361..a599e8a1b 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1677,12 +1677,8 @@ extension RegexTests { ) ) - // End of line comments aren't applicable in custom char classes. - // TODO: ICU supports this. - parseTest("(?x)[ # abc]", concat( - changeMatchingOptions(matchingOptions(adding: .extended)), - charClass("#", "a", "b", "c") - )) + parseTest("[ # abc]", charClass(" ", "#", " ", "a", "b", "c")) + parseTest("[#]", charClass("#")) parseTest("(?x)a b c[d e f]", concat( changeMatchingOptions(matchingOptions(adding: .extended)), @@ -2115,6 +2111,15 @@ extension RegexTests { /# """#, scalarSeq("\u{AB}", "\u{B}", "\u{C}")) + parseWithDelimitersTest(#""" + #/ + [ + a # interesting + b-c #a + d] + /# + """#, charClass("a", range_m("b", "c"), "d")) + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter // if it's clear that it's part of the regex syntax. @@ -2544,6 +2549,12 @@ extension RegexTests { diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b")) diagnosticTest(#"[\u{66}-\u{65}]"#, .invalidCharacterRange(from: "\u{66}", to: "\u{65}")) + diagnosticTest("(?x)[(?#)]", .expected("]")) + diagnosticTest("(?x)[(?#abc)]", .expected("]")) + + diagnosticTest("(?x)[#]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[ # abc]", .expectedCustomCharacterClassMembers) + // MARK: Bad escapes diagnosticTest("\\", .expectedEscape) From 5b0524a9d172655a0d1488cf65c4b56def2445c5 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:44 +0100 Subject: [PATCH 9/9] Allow trivia between character class range operands Factor out the logic that deals with parsing an individual character class member, and interleave `lexTrivia` calls between range operand parsing. 
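
For illustration (taken from the tests added below), in extended syntax
whitespace around the '-' no longer breaks a range apart; it is lexed as
trivia and recorded on the new `trivia` field of the range member:

    // '(?x)[ a - z ]+' now parses the class as the single range 'a'-'z'.
    firstMatchTest("(?x)[ a - z ]+", input: " 123-abcxyz", match: "abcxyz")

If no valid right-hand operand follows, the '-' still falls back to being
a literal member, e.g. in `[6-]` or `[[a]-]`.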
--- .../Regex/AST/CustomCharClass.swift | 12 ++- .../Regex/Parse/LexicalAnalysis.swift | 19 ++-- Sources/_RegexParser/Regex/Parse/Parse.swift | 87 ++++++++++++------- .../Utility/ASTBuilder.swift | 2 +- Tests/RegexTests/MatchTests.swift | 2 + Tests/RegexTests/ParseTests.swift | 23 +++++ 6 files changed, 100 insertions(+), 45 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift index d28490f25..087387c1e 100644 --- a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift +++ b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift @@ -51,11 +51,16 @@ extension AST { public var lhs: Atom public var dashLoc: SourceLocation public var rhs: Atom + public var trivia: [AST.Trivia] - public init(_ lhs: Atom, _ dashLoc: SourceLocation, _ rhs: Atom) { + public init( + _ lhs: Atom, _ dashLoc: SourceLocation, _ rhs: Atom, + trivia: [AST.Trivia] + ) { self.lhs = lhs self.dashLoc = dashLoc self.rhs = rhs + self.trivia = trivia } } public enum SetOp: String, Hashable { @@ -95,6 +100,11 @@ extension CustomCC.Member { return false } + public var asTrivia: AST.Trivia? { + guard case .trivia(let t) = self else { return nil } + return t + } + public var isSemantic: Bool { !isTrivia } diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index ff72fca01..a6dfa0ce9 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -2039,20 +2039,11 @@ extension Source { return AST.Atom(kind.value, kind.location) } - /// Try to lex the end of a range in a custom character class, which consists - /// of a '-' character followed by an atom. - mutating func lexCustomCharClassRangeEnd( - context: ParsingContext - ) throws -> (dashLoc: SourceLocation, AST.Atom)? { - // Make sure we don't have a binary operator e.g '--', and the '-' is not - // ending the custom character class (in which case it is literal). - guard peekCCBinOp() == nil, !starts(with: "-]"), - let dash = tryEatWithLoc("-"), - let end = try lexAtom(context: context) - else { - return nil - } - return (dash, end) + /// Try to lex the range operator '-' for a custom character class. + mutating func lexCustomCharacterClassRangeOperator() -> SourceLocation? { + // Eat a '-', making sure we don't have a binary op such as '--'. + guard peekCCBinOp() == nil else { return nil } + return tryEatWithLoc("-") } /// Try to consume a newline sequence matching option kind. diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 3f5bfd69f..84957220c 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -495,43 +495,72 @@ extension Parser { return CustomCC(start, members, loc(start.location.start)) } + mutating func parseCCCMember() throws -> CustomCC.Member? { + guard !source.isEmpty && source.peek() != "]" && source.peekCCBinOp() == nil + else { return nil } + + // Nested custom character class. + if let cccStart = source.lexCustomCCStart() { + return .custom(try parseCustomCharacterClass(cccStart)) + } + + // Quoted sequence. + if let quote = try source.lexQuote(context: context) { + return .quote(quote) + } + + // Lex triva if we're allowed. 
+ if let trivia = try source.lexTrivia(context: context) { + return .trivia(trivia) + } + + if let atom = try source.lexAtom(context: context) { + return .atom(atom) + } + return nil + } + mutating func parseCCCMembers( into members: inout Array ) throws { // Parse members until we see the end of the custom char class or an // operator. - while !source.isEmpty && source.peek() != "]" && - source.peekCCBinOp() == nil { - - // Nested custom character class. - if let cccStart = source.lexCustomCCStart() { - members.append(.custom(try parseCustomCharacterClass(cccStart))) - continue - } - - // Quoted sequence. - if let quote = try source.lexQuote(context: context) { - members.append(.quote(quote)) - continue - } - - // Lex trivia if we're allowed. - if let trivia = try source.lexTrivia(context: context) { - members.append(.trivia(trivia)) - continue - } + while let member = try parseCCCMember() { + members.append(member) + + // If we have an atom, we can try to parse a character class range. Each + // time we parse a component of the range, we append to `members` in case + // it ends up not being a range, and we bail. If we succeed in parsing, we + // remove the intermediate members. + if case .atom(let lhs) = member { + let membersBeforeRange = members.count - 1 + + while let t = try source.lexTrivia(context: context) { + members.append(.trivia(t)) + } - guard let atom = try source.lexAtom(context: context) else { break } + guard let dash = source.lexCustomCharacterClassRangeOperator() else { + continue + } + // If we can't parse a range, '-' becomes literal, e.g `[6-]`. + members.append(.atom(.init(.char("-"), dash))) - // Range between atoms. - if let (dashLoc, rhs) = - try source.lexCustomCharClassRangeEnd(context: context) { - members.append(.range(.init(atom, dashLoc, rhs))) - continue + while let t = try source.lexTrivia(context: context) { + members.append(.trivia(t)) + } + guard let rhs = try parseCCCMember() else { continue } + members.append(rhs) + + guard case let .atom(rhs) = rhs else { continue } + + // We've successfully parsed an atom LHS and RHS, so form a range, + // collecting the trivia we've parsed, and replacing the members that + // would have otherwise been added to the custom character class. 
+ let rangeMemberCount = members.count - membersBeforeRange + let trivia = members.suffix(rangeMemberCount).compactMap(\.asTrivia) + members.removeLast(rangeMemberCount) + members.append(.range(.init(lhs, dash, rhs, trivia: trivia))) } - - members.append(.atom(atom)) - continue } } } diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index 1001d23f9..49a08430d 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -416,7 +416,7 @@ func prop_m( func range_m( _ lower: AST.Atom, _ upper: AST.Atom ) -> AST.CustomCharacterClass.Member { - .range(.init(lower, .fake, upper)) + .range(.init(lower, .fake, upper, trivia: [])) } func range_m( _ lower: AST.Atom.Kind, _ upper: AST.Atom.Kind diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 02a1c6c49..985fb70c6 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -594,6 +594,8 @@ extension RegexTests { firstMatchTest("[a-z]", input: "123ABCxyz", match: "x") firstMatchTest("[a-z]", input: "123-abcxyz", match: "a") + firstMatchTest("(?x)[ a - z ]+", input: " 123-abcxyz", match: "abcxyz") + // Character class subtraction firstMatchTest("[a-d--a-c]", input: "123abcdxyz", match: "d") diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index a599e8a1b..6269cdff6 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -517,6 +517,8 @@ extension RegexTests { "[a-b-c]", charClass(range_m("a", "b"), "-", "c")) parseTest("[-a-]", charClass("-", "a", "-")) + parseTest("[[a]-]", charClass(charClass("a"), "-")) + parseTest("[[a]-b]", charClass(charClass("a"), "-", "b")) parseTest("[a-z]", charClass(range_m("a", "z"))) parseTest("[a-a]", charClass(range_m("a", "a"))) @@ -679,6 +681,16 @@ extension RegexTests { throwsError: .unsupported ) + parseTest(#"(?x)[ a - b ]"#, concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass(range_m("a", "b")) + )) + + parseTest(#"(?x)[a - b]"#, concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass(range_m("a", "b")) + )) + // MARK: Operators parseTest( @@ -2120,6 +2132,17 @@ extension RegexTests { /# """#, charClass("a", range_m("b", "c"), "d")) + parseWithDelimitersTest(#""" + #/ + [ + a # interesting + - #a + b + ] + /# + """#, charClass(range_m("a", "b"))) + + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter // if it's clear that it's part of the regex syntax.