diff --git a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift index 614048f0a..19e72aef5 100644 --- a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift +++ b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift @@ -97,6 +97,10 @@ extension CustomCC.Member { if case .trivia = self { return true } return false } + + public var isSemantic: Bool { + !isTrivia + } } extension AST.CustomCharacterClass { @@ -104,7 +108,7 @@ extension AST.CustomCharacterClass { /// nested custom character classes. public var strippingTriviaShallow: Self { var copy = self - copy.members = copy.members.filter { !$0.isTrivia } + copy.members = copy.members.filter(\.isSemantic) return copy } } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index c3aa3500b..b24097b83 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -438,16 +438,18 @@ extension Parser { defer { context.isInCustomCharacterClass = alreadyInCCC } typealias Member = CustomCC.Member - try source.expectNonEmpty() - var members: Array = [] + try parseCCCMembers(into: &members) - // We can eat an initial ']', as PCRE, Oniguruma, and ICU forbid empty - // character classes, and assume an initial ']' is literal. - if let loc = source.tryEatWithLoc("]") { - members.append(.atom(.init(.char("]"), loc))) + // If we didn't parse any semantic members, we can eat a ']' character, as + // PCRE, Oniguruma, and ICU forbid empty character classes, and assume an + // initial ']' is literal. + if members.none(\.isSemantic) { + if let loc = source.tryEatWithLoc("]") { + members.append(.atom(.init(.char("]"), loc))) + try parseCCCMembers(into: &members) + } } - try parseCCCMembers(into: &members) // If we have a binary set operator, parse it and the next members. Note // that this means we left associate for a chain of operators. @@ -458,8 +460,9 @@ extension Parser { var rhs: Array = [] try parseCCCMembers(into: &rhs) - if members.isEmpty || rhs.isEmpty { - throw ParseError.expectedCustomCharacterClassMembers + if members.none(\.isSemantic) || rhs.none(\.isSemantic) { + throw Source.LocatedError( + ParseError.expectedCustomCharacterClassMembers, start.location) } // If we're done, bail early @@ -472,8 +475,9 @@ extension Parser { // Otherwise it's just another member to accumulate members = [setOp] } - if members.isEmpty { - throw ParseError.expectedCustomCharacterClassMembers + if members.none(\.isSemantic) { + throw Source.LocatedError( + ParseError.expectedCustomCharacterClassMembers, start.location) } try source.expect("]") return CustomCC(start, members, loc(start.location.start)) @@ -484,7 +488,8 @@ extension Parser { ) throws { // Parse members until we see the end of the custom char class or an // operator. - while source.peek() != "]" && source.peekCCBinOp() == nil { + while !source.isEmpty && source.peek() != "]" && + source.peekCCBinOp() == nil { // Nested custom character class. if let cccStart = try source.lexCustomCCStart() { diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 0e40ad2ce..986f3d86e 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -312,7 +312,9 @@ extension AST.CustomCharacterClass.Member: _ASTPrintable { case .quote(let q): return "\(q)" case .trivia(let t): return "\(t)" case .setOperation(let lhs, let op, let rhs): - return "op \(lhs) \(op.value) \(rhs)" + // TODO: We should eventually have some way of filtering out trivia for + // tests, so that it can appear in regular dumps. + return "op \(lhs.filter(\.isSemantic)) \(op.value) \(rhs.filter(\.isSemantic))" } } } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index b343862ef..49e459c0b 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -287,11 +287,8 @@ extension DSLTree.CustomCharacterClass.Member { } case .trivia: // TODO: Should probably strip this earlier... - return { _, bounds in - return bounds.lowerBound - } + return { _, _ in nil } } - } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index f1074c033..448ff3211 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1208,6 +1208,19 @@ extension RegexTests { ("CaFe", true), ("EfAc", true)) } + + func testNonSemanticWhitespace() { + firstMatchTest(#" \t "#, input: " \t ", match: " \t ") + firstMatchTest(#"(?xx) \t "#, input: " \t ", match: "\t") + + firstMatchTest(#"[ \t]+"#, input: " \t ", match: " \t ") + firstMatchTest(#"(?xx)[ \t]+"#, input: " \t ", match: "\t") + firstMatchTest(#"(?xx)[ \t]+"#, input: " \t\t ", match: "\t\t") + firstMatchTest(#"(?xx)[ \t]+"#, input: " \t \t", match: "\t") + + firstMatchTest("(?xx)[ a && ab ]+", input: " aaba ", match: "aa") + firstMatchTest("(?xx)[ ] a ]+", input: " a]]a ] ", match: "a]]a") + } func testASCIIClasses() { // 'D' ASCII-only digits diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index e9b422379..65dd6ed09 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -460,9 +460,25 @@ extension RegexTests { parseTest("[-]", charClass("-")) - // Empty character classes are forbidden, therefore this is a character - // class of literal ']'. + // Empty character classes are forbidden, therefore these are character + // classes containing literal ']'. parseTest("[]]", charClass("]")) + parseTest("[]a]", charClass("]", "a")) + parseTest( + "(?x)[ ]]", changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + charClass("]")) + ) + parseTest( + "(?x)[ ] ]", changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + charClass("]")) + ) + parseTest( + "(?x)[ ] a ]", changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + charClass("]", "a")) + ) // These are metacharacters in certain contexts, but normal characters // otherwise. @@ -613,6 +629,16 @@ extension RegexTests { parseTest( "~~*", concat("~", zeroOrMore(of: "~"))) + parseTest( + "[ && ]", charClass( + .setOperation([" "], .init(faking: .intersection), [" ", " "])) + ) + parseTest( + "(?x)[ a && b ]", changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, charClass( + .setOperation(["a"], .init(faking: .intersection), ["b"])) + )) + // MARK: Quotes parseTest( @@ -2205,6 +2231,9 @@ extension RegexTests { diagnosticTest(")))", .unbalancedEndOfGroup) diagnosticTest("())()", .unbalancedEndOfGroup) + diagnosticTest("[", .expectedCustomCharacterClassMembers) + diagnosticTest("[^", .expectedCustomCharacterClassMembers) + diagnosticTest(#"\u{5"#, .expected("}")) diagnosticTest(#"\x{5"#, .expected("}")) diagnosticTest(#"\N{A"#, .expected("}")) @@ -2245,9 +2274,21 @@ extension RegexTests { diagnosticTest("(?")) diagnosticTest("(?", .expected(")")) - // The first ']' of a custom character class is literal, so this is missing - // the closing bracket. + // MARK: Character classes + + diagnosticTest("[a", .expected("]")) + + // The first ']' of a custom character class is literal, so these are + // missing the closing bracket. diagnosticTest("[]", .expected("]")) + diagnosticTest("(?x)[ ]", .expected("]")) + + diagnosticTest("[&&]", .expectedCustomCharacterClassMembers) + diagnosticTest("[a&&]", .expectedCustomCharacterClassMembers) + diagnosticTest("[&&a]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[ && ]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[ &&a]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[a&& ]", .expectedCustomCharacterClassMembers) diagnosticTest("[:a", .expected("]")) diagnosticTest("[:a:", .expected("]"))