From e3cdcb56509815431b0c6fbb09ec4294b50ee3ee Mon Sep 17 00:00:00 2001
From: Hamish Knight
Date: Tue, 24 May 2022 11:05:41 +0100
Subject: [PATCH 1/9] Introduce Source.lookahead

Use this to replace the various places we're doing `var src = self`.
---
 .../Regex/Parse/LexicalAnalysis.swift         | 86 +++++++++++--------
 1 file changed, 49 insertions(+), 37 deletions(-)

diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
index cbc9d5eb2..936e6b189 100644
--- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
+++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -149,6 +149,14 @@ extension Source {
     return result
   }
 
+  /// Perform a lookahead using a temporary source. Within the body of the
+  /// lookahead, any modifications to the source will not be reflected outside
+  /// the body.
+  func lookahead<T>(_ body: (inout Source) throws -> T) rethrows -> T {
+    var src = self
+    return try body(&src)
+  }
+
   /// Attempt to eat the given character, returning its source location if
   /// successful, `nil` otherwise.
   mutating func tryEatWithLoc(_ c: Character) -> SourceLocation? {
@@ -1240,8 +1248,9 @@ extension Source {
 
   private func canLexPOSIXCharacterProperty() -> Bool {
     do {
-      var src = self
-      return try src.lexPOSIXCharacterProperty() != nil
+      return try lookahead { src in
+        try src.lexPOSIXCharacterProperty() != nil
+      }
     } catch {
       // We want to tend on the side of lexing a POSIX character property, so
       // even if it is invalid in some way (e.g invalid property names), still
@@ -1394,10 +1403,11 @@ extension Source {
 
   /// Checks whether a numbered reference can be lexed.
   private func canLexNumberedReference() -> Bool {
-    var src = self
-    _ = src.tryEat(anyOf: "+", "-")
-    guard let next = src.peek() else { return false }
-    return RadixKind.decimal.characterFilter(next)
+    lookahead { src in
+      _ = src.tryEat(anyOf: "+", "-")
+      guard let next = src.peek() else { return false }
+      return RadixKind.decimal.characterFilter(next)
+    }
   }
 
   /// Eat a named reference up to a given closing delimiter.
@@ -1587,53 +1597,55 @@ extension Source {
 
   /// Whether we can lex a group-like reference after the specifier '(?'.
   private func canLexGroupLikeReference() -> Bool {
-    var src = self
-    if src.tryEat("P") {
-      return src.tryEat(anyOf: "=", ">") != nil
-    }
-    if src.tryEat(anyOf: "&", "R") != nil {
-      return true
+    lookahead { src in
+      if src.tryEat("P") {
+        return src.tryEat(anyOf: "=", ">") != nil
+      }
+      if src.tryEat(anyOf: "&", "R") != nil {
+        return true
+      }
+      return src.canLexNumberedReference()
     }
-    return src.canLexNumberedReference()
   }
 
   private func canLexMatchingOptionsAsAtom(context: ParsingContext) -> Bool {
-    var src = self
-
-    // See if we can lex a matching option sequence that terminates in ')'. Such
-    // a sequence is an atom. If an error is thrown, there are invalid elements
-    // of the matching option sequence. In such a case, we can lex as a group
-    // and diagnose the invalid group kind.
-    guard (try? src.lexMatchingOptionSequence(context: context)) != nil else {
-      return false
+    lookahead { src in
+      // See if we can lex a matching option sequence that terminates in ')'.
+      // Such a sequence is an atom. If an error is thrown, there are invalid
+      // elements of the matching option sequence. In such a case, we can lex as
+      // a group and diagnose the invalid group kind.
+      guard (try? 
src.lexMatchingOptionSequence(context: context)) != nil else { + return false + } + return src.tryEat(")") } - return src.tryEat(")") } /// Whether a group specifier should be lexed as an atom instead of a group. private func shouldLexGroupLikeAtom(context: ParsingContext) -> Bool { - var src = self - guard src.tryEat("(") else { return false } + lookahead { src in + guard src.tryEat("(") else { return false } - if src.tryEat("?") { - // The start of a reference '(?P=', '(?R', ... - if src.canLexGroupLikeReference() { return true } + if src.tryEat("?") { + // The start of a reference '(?P=', '(?R', ... + if src.canLexGroupLikeReference() { return true } - // The start of a PCRE callout. - if src.tryEat("C") { return true } + // The start of a PCRE callout. + if src.tryEat("C") { return true } - // The start of an Oniguruma 'of-contents' callout. - if src.tryEat("{") { return true } + // The start of an Oniguruma 'of-contents' callout. + if src.tryEat("{") { return true } - // A matching option atom (?x), (?i), ... - if src.canLexMatchingOptionsAsAtom(context: context) { return true } + // A matching option atom (?x), (?i), ... + if src.canLexMatchingOptionsAsAtom(context: context) { return true } + + return false + } + // The start of a backreference directive or Oniguruma named callout. + if src.tryEat("*") { return true } return false } - // The start of a backreference directive or Oniguruma named callout. - if src.tryEat("*") { return true } - - return false } /// Consume an escaped atom, starting from after the backslash From 8ddacdb2d1483800d6573c28ca3ed782eddf1b0b Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:42 +0100 Subject: [PATCH 2/9] Remove `throws` from a couple of lexing methods --- .../Regex/Parse/LexicalAnalysis.swift | 15 +++++---------- Sources/_RegexParser/Regex/Parse/Parse.swift | 6 +++--- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 936e6b189..0d4928406 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -421,9 +421,7 @@ extension Source { ) throws -> (Located, Located, [AST.Trivia])? { var trivia: [AST.Trivia] = [] - if let t = try lexNonSemanticWhitespace(context: context) { - trivia.append(t) - } + if let t = lexNonSemanticWhitespace(context: context) { trivia.append(t) } let amt: Located? = try recordLoc { src in if src.tryEat("*") { return .zeroOrMore } @@ -441,9 +439,7 @@ extension Source { guard let amt = amt else { return nil } // PCRE allows non-semantic whitespace here in extended syntax mode. - if let t = try lexNonSemanticWhitespace(context: context) { - trivia.append(t) - } + if let t = lexNonSemanticWhitespace(context: context) { trivia.append(t) } let kind: Located = recordLoc { src in if src.tryEat("?") { return .reluctant } @@ -675,7 +671,7 @@ extension Source { /// Does nothing unless `SyntaxOptions.nonSemanticWhitespace` is set mutating func lexNonSemanticWhitespace( context: ParsingContext - ) throws -> AST.Trivia? { + ) -> AST.Trivia? 
{ guard context.ignoreWhitespace else { return nil } // FIXME: PCRE only treats space and tab characters as whitespace when @@ -707,7 +703,7 @@ extension Source { if let comment = try lexComment(context: context) { return comment } - if let whitespace = try lexNonSemanticWhitespace(context: context) { + if let whitespace = lexNonSemanticWhitespace(context: context) { return whitespace } return nil @@ -1186,8 +1182,7 @@ extension Source { } } - mutating func lexCustomCCStart( - ) throws -> Located? { + mutating func lexCustomCCStart() -> Located? { recordLoc { src in // Make sure we don't have a POSIX character property. This may require // walking to its ending to make sure we have a closing ':]', as otherwise diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index a3ddfb812..c31380c9d 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -429,7 +429,7 @@ extension Parser { } // Check if we have the start of a custom character class '['. - if let cccStart = try source.lexCustomCCStart() { + if let cccStart = source.lexCustomCCStart() { return .customCharacterClass( try parseCustomCharacterClass(cccStart)) } @@ -513,7 +513,7 @@ extension Parser { source.peekCCBinOp() == nil { // Nested custom character class. - if let cccStart = try source.lexCustomCCStart() { + if let cccStart = source.lexCustomCCStart() { members.append(.custom(try parseCustomCharacterClass(cccStart))) continue } @@ -527,7 +527,7 @@ extension Parser { // Lex non-semantic whitespace if we're allowed. // TODO: ICU allows end-of-line comments in custom character classes, // which we ought to support if we want to support multi-line regex. - if let trivia = try source.lexNonSemanticWhitespace(context: context) { + if let trivia = source.lexNonSemanticWhitespace(context: context) { members.append(.trivia(trivia)) continue } From 2227777e2bd40d6964ed56f3ccc82b3eabc5b37b Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:42 +0100 Subject: [PATCH 3/9] Add ASTBuilder helper for char class set operations --- .../Utility/ASTBuilder.swift | 8 ++++ Tests/RegexTests/ParseTests.swift | 38 +++++++++---------- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index 78477e2b5..1001d23f9 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -319,6 +319,14 @@ func charClass( return .custom(cc) } +func setOp( + _ lhs: AST.CustomCharacterClass.Member..., + op: AST.CustomCharacterClass.SetOp, + _ rhs: AST.CustomCharacterClass.Member... 
+) -> AST.CustomCharacterClass.Member { + .setOperation(lhs, .init(faking: op), rhs) +} + func quote(_ s: String) -> AST.Node { .quote(.init(s, .fake)) } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 975d86b75..0a708c5f2 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -522,9 +522,8 @@ extension RegexTests { parseTest("[a-a]", charClass(range_m("a", "a"))) parseTest("[B-a]", charClass(range_m("B", "a"))) - // FIXME: AST builder helpers for custom char class types parseTest("[a-d--a-c]", charClass( - .setOperation([range_m("a", "d")], .init(faking: .subtraction), [range_m("a", "c")]) + setOp(range_m("a", "d"), op: .subtraction, range_m("a", "c")) )) parseTest("[-]", charClass("-")) @@ -684,30 +683,31 @@ extension RegexTests { parseTest( #"[a[bc]de&&[^bc]\d]+"#, - oneOrMore(of: charClass( - .setOperation( - ["a", charClass("b", "c"), "d", "e"], - .init(faking: .intersection), - [charClass("b", "c", inverted: true), atom_m(.escaped(.decimalDigit))] - )))) + oneOrMore(of: charClass(setOp( + "a", charClass("b", "c"), "d", "e", + op: .intersection, + charClass("b", "c", inverted: true), atom_m(.escaped(.decimalDigit)) + ))) + ) parseTest( - "[a&&b]", - charClass( - .setOperation(["a"], .init(faking: .intersection), ["b"]))) + "[a&&b]", charClass(setOp("a", op: .intersection, "b")) + ) parseTest( "[abc--def]", - charClass(.setOperation(["a", "b", "c"], .init(faking: .subtraction), ["d", "e", "f"]))) + charClass(setOp("a", "b", "c", op: .subtraction, "d", "e", "f")) + ) // We left-associate for chained operators. parseTest( "[ab&&b~~cd]", - charClass( - .setOperation( - [.setOperation(["a", "b"], .init(faking: .intersection), ["b"])], - .init(faking: .symmetricDifference), - ["c", "d"]))) + charClass(setOp( + setOp("a", "b", op: .intersection, "b"), + op: .symmetricDifference, + "c", "d" + )) + ) // Operators are only valid in custom character classes. parseTest( @@ -723,11 +723,11 @@ extension RegexTests { parseTest( "[ && ]", - charClass(.setOperation([" "], .init(faking: .intersection), [" ", " "])) + charClass(setOp(" ", op: .intersection, " ", " ")) ) parseTest("(?x)[ a && b ]", concat( changeMatchingOptions(matchingOptions(adding: .extended)), - charClass(.setOperation(["a"], .init(faking: .intersection), ["b"])) + charClass(setOp("a", op: .intersection, "b")) )) // MARK: Quotes From acd3d1ce89cb0e63cb7548855e37b01ce772f4c1 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:43 +0100 Subject: [PATCH 4/9] Simplify character class parsing a little We don't have to handle bailing early, the loop will terminate if we don't lex another operator. 
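
For illustration, behavior is unchanged: chained operators still fold
left-associatively, and once no further operator can be lexed the loop simply
ends and the trailing ']' is consumed as before. A sketch using the `setOp`
test helper from the previous commit (this mirrors the existing ParseTests
expectation, it is not new behavior):

    // "[ab&&b~~cd]" parses as
    //   charClass(setOp(setOp("a", "b", op: .intersection, "b"),
    //                   op: .symmetricDifference, "c", "d"))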
--- Sources/_RegexParser/Regex/Parse/Parse.swift | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index c31380c9d..1cdb2c84e 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -485,16 +485,7 @@ extension Parser { throw Source.LocatedError( ParseError.expectedCustomCharacterClassMembers, start.location) } - - // If we're done, bail early - let setOp = Member.setOperation(members, binOp, rhs) - if source.tryEat("]") { - return CustomCC( - start, [setOp], loc(start.location.start)) - } - - // Otherwise it's just another member to accumulate - members = [setOp] + members = [.setOperation(members, binOp, rhs)] } if members.none(\.isSemantic) { throw Source.LocatedError( From b721ea74c396f0c0f4d0c4a7e55c820f6ca8d28c Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:43 +0100 Subject: [PATCH 5/9] Dump the inverted bit of a custom character class Make sure an inverted character class does not dump the same as a regular character class. --- Sources/_RegexParser/Regex/Printing/DumpAST.swift | 2 +- Tests/RegexTests/ParseTests.swift | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 110b7b480..10e50d712 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -309,7 +309,7 @@ extension AST.CustomCharacterClass: _ASTNode { // comparisons of dumped output in tests. // TODO: We should eventually have some way of filtering out trivia for // tests, so that it can appear in regular dumps. - return "customCharacterClass(\(strippingTriviaShallow.members))" + return "customCharacterClass(inverted: \(isInverted), \(strippingTriviaShallow.members))" } } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 0a708c5f2..05d37e163 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2210,6 +2210,8 @@ extension RegexTests { parseNotEqualTest(#"[abc]"#, #"[a b c]"#) + parseNotEqualTest("[abc]", "[^abc]") + parseNotEqualTest(#"\1"#, #"\10"#) parseNotEqualTest("(?^:)", ("(?-:)")) From 033c4bb32ce31aab7bd7fe0f833c724e3c43840a Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:43 +0100 Subject: [PATCH 6/9] Allow empty comments `expectQuoted` expects non-empty contents, which doesn't apply to comments. --- Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift | 4 ++-- Tests/RegexTests/ParseTests.swift | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 0d4928406..d13edde57 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -630,10 +630,10 @@ extension Source { mutating func lexComment(context: ParsingContext) throws -> AST.Trivia? { let trivia: Located? 
= try recordLoc { src in if src.tryEat(sequence: "(?#") { - return try src.expectQuoted(endingWith: ")").value + return try src.lexUntil(eating: ")").value } if context.experimentalComments, src.tryEat(sequence: "/*") { - return try src.expectQuoted(endingWith: "*/").value + return try src.lexUntil(eating: "*/").value } if context.endOfLineComments, src.tryEat("#") { // Try eat until we either exhaust the input, or hit a newline. Note diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 05d37e163..fa8aa70d9 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1648,6 +1648,9 @@ extension RegexTests { parseTest("[(?#abc)]", charClass("(", "?", "#", "a", "b", "c", ")")) parseTest("# abc", concat("#", " ", "a", "b", "c")) + parseTest("(?#)", empty()) + parseTest("/**/", empty(), syntax: .experimental) + // MARK: Matching option changing parseTest( From 7fe243ebe7e7e0630f198780f07bb649630a4efc Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:44 +0100 Subject: [PATCH 7/9] Lex whitespace in range quantifiers PCRE does not allow whitespace here, instead treating the sequence as literal if whitespace is present. However this behavior is quite unintuitive. Instead, lex whitespace between range operands. --- .../Regex/Parse/LexicalAnalysis.swift | 14 ++++++++++++-- Tests/RegexTests/MatchTests.swift | 6 ++++++ Tests/RegexTests/ParseTests.swift | 16 +++++++++------- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index d13edde57..f1038da57 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -430,7 +430,7 @@ extension Source { return try src.tryEating { src in guard src.tryEat("{"), - let range = try src.lexRange(context: context), + let range = try src.lexRange(context: context, trivia: &trivia), src.tryEat("}") else { return nil } return range.value @@ -456,11 +456,17 @@ extension Source { /// | ExpRange /// ExpRange -> '..<' | '...' /// | '..<' | '...' ? - mutating func lexRange(context: ParsingContext) throws -> Located? { + mutating func lexRange( + context: ParsingContext, trivia: inout [AST.Trivia] + ) throws -> Located? { try recordLoc { src in try src.tryEating { src in + if let t = src.lexWhitespace() { trivia.append(t) } + let lowerOpt = try src.lexNumber() + if let t = src.lexWhitespace() { trivia.append(t) } + // ',' or '...' or '..<' or nothing // TODO: We ought to try and consume whitespace here and emit a // diagnostic for the user warning them that it would cause the range to @@ -480,11 +486,15 @@ extension Source { closedRange = nil } + if let t = src.lexWhitespace() { trivia.append(t) } + let upperOpt = try src.lexNumber()?.map { upper in // If we have an open range, the upper bound should be adjusted down. closedRange == true ? 
upper : upper - 1 } + if let t = src.lexWhitespace() { trivia.append(t) } + switch (lowerOpt, closedRange, upperOpt) { case let (l?, nil, nil): return .exactly(l) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 496e02658..d4de5eede 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -341,8 +341,12 @@ extension RegexTests { firstMatchTest( #"a{1,2}"#, input: "123aaaxyz", match: "aa") + firstMatchTest( + #"a{ 1 , 2 }"#, input: "123aaaxyz", match: "aa") firstMatchTest( #"a{,2}"#, input: "123aaaxyz", match: "") + firstMatchTest( + #"a{ , 2 }"#, input: "123aaaxyz", match: "") firstMatchTest( #"a{,2}x"#, input: "123aaaxyz", match: "aax") firstMatchTest( @@ -351,6 +355,8 @@ extension RegexTests { #"a{2,}"#, input: "123aaaxyz", match: "aaa") firstMatchTest( #"a{1}"#, input: "123aaaxyz", match: "a") + firstMatchTest( + #"a{ 1 }"#, input: "123aaaxyz", match: "a") firstMatchTest( #"a{1,2}?"#, input: "123aaaxyz", match: "a") firstMatchTest( diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index fa8aa70d9..e389ea5d2 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -832,6 +832,10 @@ extension RegexTests { #"a{1,1}"#, quantRange(1...1, of: "a")) + parseTest("x{3, 5}", quantRange(3 ... 5, of: "x")) + parseTest("x{ 3 , 5 }", quantRange(3 ... 5, of: "x")) + parseTest("x{3 }", exactly(3, of: "x")) + // Make sure ranges get treated as literal if invalid. parseTest("{", "{") parseTest("{,", concat("{", ",")) @@ -851,11 +855,6 @@ extension RegexTests { parseTest("x{+", concat("x", oneOrMore(of: "{"))) parseTest("x{6,+", concat("x", "{", "6", oneOrMore(of: ","))) - // TODO: We should emit a diagnostic for this. - parseTest("x{3, 5}", concat("x", "{", "3", ",", " ", "5", "}")) - parseTest("{3, 5}", concat("{", "3", ",", " ", "5", "}")) - parseTest("{3 }", concat("{", "3", " ", "}")) - // MARK: Groups // Named captures @@ -1771,10 +1770,10 @@ extension RegexTests { // PCRE states that whitespace won't be ignored within a range. // http://pcre.org/current/doc/html/pcre2api.html#SEC20 - // TODO: We ought to warn on this, and produce a range anyway. + // We however do ignore it. parseTest("(?x)a{1, 3}", concat( changeMatchingOptions(matchingOptions(adding: .extended)), - "a", "{", "1", ",", "3", "}" + quantRange(1 ... 3, of: "a") )) // Test that we cover the list of whitespace characters covered by PCRE. @@ -2695,6 +2694,9 @@ extension RegexTests { diagnosticTest("{1,3}", .quantifierRequiresOperand("{1,3}")) diagnosticTest("a{3,2}", .invalidQuantifierRange(3, 2)) + diagnosticTest("{3, 5}", .quantifierRequiresOperand("{3, 5}")) + diagnosticTest("{3 }", .quantifierRequiresOperand("{3 }")) + // These are not quantifiable. diagnosticTest(#"\b?"#, .notQuantifiable) diagnosticTest(#"\B*"#, .notQuantifiable) From a6bf9b816eb835121c15cc5fd1b96b589d4d0aa3 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:44 +0100 Subject: [PATCH 8/9] Parse end-of-line comments in custom character classes Previously we would only parse non-semantic whitespace, but also expand to end-of-line comments, which are supported by ICU. 
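
For example, with extended syntax enabled a '#' inside a custom character
class now begins an end-of-line comment, which is lexed as trivia rather than
as class members. Roughly, mirroring the new ParseTests cases (helper names as
used in that file):

    // Multi-line literal with extended syntax:
    //   [
    //     a # interesting
    //     b-c
    //   ]
    // parses as charClass("a", range_m("b", "c")), with the comment and
    // whitespace kept as trivia. Conversely, "(?x)[ # abc]" now diagnoses
    // expectedCustomCharacterClassMembers, since the comment swallows the
    // would-be members.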
--- .../Regex/Parse/LexicalAnalysis.swift | 2 +- Sources/_RegexParser/Regex/Parse/Parse.swift | 6 ++--- Tests/RegexTests/ParseTests.swift | 23 ++++++++++++++----- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index f1038da57..ff72fca01 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -639,7 +639,7 @@ extension Source { /// mutating func lexComment(context: ParsingContext) throws -> AST.Trivia? { let trivia: Located? = try recordLoc { src in - if src.tryEat(sequence: "(?#") { + if !context.isInCustomCharacterClass && src.tryEat(sequence: "(?#") { return try src.lexUntil(eating: ")").value } if context.experimentalComments, src.tryEat(sequence: "/*") { diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 1cdb2c84e..3f5bfd69f 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -515,10 +515,8 @@ extension Parser { continue } - // Lex non-semantic whitespace if we're allowed. - // TODO: ICU allows end-of-line comments in custom character classes, - // which we ought to support if we want to support multi-line regex. - if let trivia = source.lexNonSemanticWhitespace(context: context) { + // Lex trivia if we're allowed. + if let trivia = try source.lexTrivia(context: context) { members.append(.trivia(trivia)) continue } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index e389ea5d2..1f7d64002 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1677,12 +1677,8 @@ extension RegexTests { ) ) - // End of line comments aren't applicable in custom char classes. - // TODO: ICU supports this. - parseTest("(?x)[ # abc]", concat( - changeMatchingOptions(matchingOptions(adding: .extended)), - charClass("#", "a", "b", "c") - )) + parseTest("[ # abc]", charClass(" ", "#", " ", "a", "b", "c")) + parseTest("[#]", charClass("#")) parseTest("(?x)a b c[d e f]", concat( changeMatchingOptions(matchingOptions(adding: .extended)), @@ -2115,6 +2111,15 @@ extension RegexTests { /# """#, scalarSeq("\u{AB}", "\u{B}", "\u{C}")) + parseWithDelimitersTest(#""" + #/ + [ + a # interesting + b-c #a + d] + /# + """#, charClass("a", range_m("b", "c"), "d")) + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter // if it's clear that it's part of the regex syntax. @@ -2544,6 +2549,12 @@ extension RegexTests { diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b")) diagnosticTest(#"[\u{66}-\u{65}]"#, .invalidCharacterRange(from: "\u{66}", to: "\u{65}")) + diagnosticTest("(?x)[(?#)]", .expected("]")) + diagnosticTest("(?x)[(?#abc)]", .expected("]")) + + diagnosticTest("(?x)[#]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[ # abc]", .expectedCustomCharacterClassMembers) + // MARK: Bad escapes diagnosticTest("\\", .expectedEscape) From 4328e738d93857ef87336528c38ec5131e1baf92 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 24 May 2022 11:05:44 +0100 Subject: [PATCH 9/9] Allow trivia between character class range operands Factor out the logic that deals with parsing an individual character class member, and interleave `lexTrivia` calls between range operand parsing. 
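
As a rough sketch of the resulting behavior (mirroring the new test
expectations rather than the parser internals):

    // In extended syntax, whitespace around the range operator is trivia:
    //   (?x)[ a - b ]   lexes as atom(a), '-', atom(b) plus trivia, and
    //                   parses as the single range 'a'-'b'.
    // A '-' that cannot form a range stays a literal member:
    //   [[a]-b]         parses as [a], '-', 'b' (no range), and
    //   [[a]-]          keeps the trailing '-' as an atom.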
--- .../Regex/AST/CustomCharClass.swift | 12 ++- .../Regex/Parse/LexicalAnalysis.swift | 19 ++-- Sources/_RegexParser/Regex/Parse/Parse.swift | 87 ++++++++++++------- .../Utility/ASTBuilder.swift | 2 +- Tests/RegexTests/MatchTests.swift | 2 + Tests/RegexTests/ParseTests.swift | 23 +++++ 6 files changed, 100 insertions(+), 45 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift index d28490f25..087387c1e 100644 --- a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift +++ b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift @@ -51,11 +51,16 @@ extension AST { public var lhs: Atom public var dashLoc: SourceLocation public var rhs: Atom + public var trivia: [AST.Trivia] - public init(_ lhs: Atom, _ dashLoc: SourceLocation, _ rhs: Atom) { + public init( + _ lhs: Atom, _ dashLoc: SourceLocation, _ rhs: Atom, + trivia: [AST.Trivia] + ) { self.lhs = lhs self.dashLoc = dashLoc self.rhs = rhs + self.trivia = trivia } } public enum SetOp: String, Hashable { @@ -95,6 +100,11 @@ extension CustomCC.Member { return false } + public var asTrivia: AST.Trivia? { + guard case .trivia(let t) = self else { return nil } + return t + } + public var isSemantic: Bool { !isTrivia } diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index ff72fca01..a6dfa0ce9 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -2039,20 +2039,11 @@ extension Source { return AST.Atom(kind.value, kind.location) } - /// Try to lex the end of a range in a custom character class, which consists - /// of a '-' character followed by an atom. - mutating func lexCustomCharClassRangeEnd( - context: ParsingContext - ) throws -> (dashLoc: SourceLocation, AST.Atom)? { - // Make sure we don't have a binary operator e.g '--', and the '-' is not - // ending the custom character class (in which case it is literal). - guard peekCCBinOp() == nil, !starts(with: "-]"), - let dash = tryEatWithLoc("-"), - let end = try lexAtom(context: context) - else { - return nil - } - return (dash, end) + /// Try to lex the range operator '-' for a custom character class. + mutating func lexCustomCharacterClassRangeOperator() -> SourceLocation? { + // Eat a '-', making sure we don't have a binary op such as '--'. + guard peekCCBinOp() == nil else { return nil } + return tryEatWithLoc("-") } /// Try to consume a newline sequence matching option kind. diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 3f5bfd69f..84957220c 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -495,43 +495,72 @@ extension Parser { return CustomCC(start, members, loc(start.location.start)) } + mutating func parseCCCMember() throws -> CustomCC.Member? { + guard !source.isEmpty && source.peek() != "]" && source.peekCCBinOp() == nil + else { return nil } + + // Nested custom character class. + if let cccStart = source.lexCustomCCStart() { + return .custom(try parseCustomCharacterClass(cccStart)) + } + + // Quoted sequence. + if let quote = try source.lexQuote(context: context) { + return .quote(quote) + } + + // Lex triva if we're allowed. 
+ if let trivia = try source.lexTrivia(context: context) { + return .trivia(trivia) + } + + if let atom = try source.lexAtom(context: context) { + return .atom(atom) + } + return nil + } + mutating func parseCCCMembers( into members: inout Array ) throws { // Parse members until we see the end of the custom char class or an // operator. - while !source.isEmpty && source.peek() != "]" && - source.peekCCBinOp() == nil { - - // Nested custom character class. - if let cccStart = source.lexCustomCCStart() { - members.append(.custom(try parseCustomCharacterClass(cccStart))) - continue - } - - // Quoted sequence. - if let quote = try source.lexQuote(context: context) { - members.append(.quote(quote)) - continue - } - - // Lex trivia if we're allowed. - if let trivia = try source.lexTrivia(context: context) { - members.append(.trivia(trivia)) - continue - } + while let member = try parseCCCMember() { + members.append(member) + + // If we have an atom, we can try to parse a character class range. Each + // time we parse a component of the range, we append to `members` in case + // it ends up not being a range, and we bail. If we succeed in parsing, we + // remove the intermediate members. + if case .atom(let lhs) = member { + let membersBeforeRange = members.count - 1 + + while let t = try source.lexTrivia(context: context) { + members.append(.trivia(t)) + } - guard let atom = try source.lexAtom(context: context) else { break } + guard let dash = source.lexCustomCharacterClassRangeOperator() else { + continue + } + // If we can't parse a range, '-' becomes literal, e.g `[6-]`. + members.append(.atom(.init(.char("-"), dash))) - // Range between atoms. - if let (dashLoc, rhs) = - try source.lexCustomCharClassRangeEnd(context: context) { - members.append(.range(.init(atom, dashLoc, rhs))) - continue + while let t = try source.lexTrivia(context: context) { + members.append(.trivia(t)) + } + guard let rhs = try parseCCCMember() else { continue } + members.append(rhs) + + guard case let .atom(rhs) = rhs else { continue } + + // We've successfully parsed an atom LHS and RHS, so form a range, + // collecting the trivia we've parsed, and replacing the members that + // would have otherwise been added to the custom character class. 
+ let rangeMemberCount = members.count - membersBeforeRange + let trivia = members.suffix(rangeMemberCount).compactMap(\.asTrivia) + members.removeLast(rangeMemberCount) + members.append(.range(.init(lhs, dash, rhs, trivia: trivia))) } - - members.append(.atom(atom)) - continue } } } diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index 1001d23f9..49a08430d 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -416,7 +416,7 @@ func prop_m( func range_m( _ lower: AST.Atom, _ upper: AST.Atom ) -> AST.CustomCharacterClass.Member { - .range(.init(lower, .fake, upper)) + .range(.init(lower, .fake, upper, trivia: [])) } func range_m( _ lower: AST.Atom.Kind, _ upper: AST.Atom.Kind diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index d4de5eede..5592ab24a 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -594,6 +594,8 @@ extension RegexTests { firstMatchTest("[a-z]", input: "123ABCxyz", match: "x") firstMatchTest("[a-z]", input: "123-abcxyz", match: "a") + firstMatchTest("(?x)[ a - z ]+", input: " 123-abcxyz", match: "abcxyz") + // Character class subtraction firstMatchTest("[a-d--a-c]", input: "123abcdxyz", match: "d") diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 1f7d64002..06f05e4ed 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -517,6 +517,8 @@ extension RegexTests { "[a-b-c]", charClass(range_m("a", "b"), "-", "c")) parseTest("[-a-]", charClass("-", "a", "-")) + parseTest("[[a]-]", charClass(charClass("a"), "-")) + parseTest("[[a]-b]", charClass(charClass("a"), "-", "b")) parseTest("[a-z]", charClass(range_m("a", "z"))) parseTest("[a-a]", charClass(range_m("a", "a"))) @@ -679,6 +681,16 @@ extension RegexTests { throwsError: .unsupported ) + parseTest(#"(?x)[ a - b ]"#, concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass(range_m("a", "b")) + )) + + parseTest(#"(?x)[a - b]"#, concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass(range_m("a", "b")) + )) + // MARK: Operators parseTest( @@ -2120,6 +2132,17 @@ extension RegexTests { /# """#, charClass("a", range_m("b", "c"), "d")) + parseWithDelimitersTest(#""" + #/ + [ + a # interesting + - #a + b + ] + /# + """#, charClass(range_m("a", "b"))) + + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter // if it's clear that it's part of the regex syntax.