diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index c48d53de9..bf53d0ab1 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -1064,11 +1064,16 @@ extension Source { } mutating func lexCustomCCStart( + context: ParsingContext ) throws -> Located? { recordLoc { src in - // POSIX named sets are atoms. - guard !src.starts(with: "[:") else { return nil } - + // Make sure we don't have a POSIX character property. This may require + // walking to its ending to make sure we have a closing ':]', as otherwise + // we have a custom character class. + // TODO: This behavior seems subtle, could we warn? + guard !src.canLexPOSIXCharacterProperty(context: context) else { + return nil + } if src.tryEat("[") { return src.tryEat("^") ? .inverted : .normal } @@ -1099,12 +1104,38 @@ extension Source { } private mutating func lexPOSIXCharacterProperty( + context: ParsingContext ) throws -> Located? { - try recordLoc { src in - guard src.tryEat(sequence: "[:") else { return nil } - let inverted = src.tryEat("^") - let prop = try src.lexCharacterPropertyContents(end: ":]").value - return .init(prop, isInverted: inverted, isPOSIX: true) + // Only allowed in a custom character class. + guard context.isInCustomCharacterClass else { return nil } + return try recordLoc { src in + try src.tryEating { src in + guard src.tryEat(sequence: "[:") else { return nil } + let inverted = src.tryEat("^") + + // Note we lex the contents and ending *before* classifying, because we + // want to bail with nil if we don't have the right ending. This allows + // the lexing of a custom character class if we don't have a ':]' + // ending. + let (key, value) = src.lexCharacterPropertyKeyValue() + guard src.tryEat(sequence: ":]") else { return nil } + + let prop = try Source.classifyCharacterPropertyContents(key: key, + value: value) + return .init(prop, isInverted: inverted, isPOSIX: true) + } + } + } + + private func canLexPOSIXCharacterProperty(context: ParsingContext) -> Bool { + do { + var src = self + return try src.lexPOSIXCharacterProperty(context: context) != nil + } catch { + // We want to tend on the side of lexing a POSIX character property, so + // even if it is invalid in some way (e.g invalid property names), still + // try and lex it. + return true } } @@ -1129,26 +1160,52 @@ extension Source { } } - private mutating func lexCharacterPropertyContents( - end: String - ) throws -> Located { - try recordLoc { src in - // We should either have: - // - 'x=y' where 'x' is a property key, and 'y' is a value. - // - 'y' where 'y' is a value (or a bool key with an inferred value - // of true), and its key is inferred. - // TODO: We could have better recovery here if we only ate the characters - // that property keys and values can use. - let lhs = src.lexUntil { - $0.isEmpty || $0.peek() == "=" || $0.starts(with: end) - }.value - if src.tryEat("=") { - let rhs = try src.lexUntil(eating: end).value - return try Source.classifyCharacterProperty(key: lhs, value: rhs) + private mutating func lexCharacterPropertyKeyValue( + ) -> (key: String?, value: String) { + func atPossibleEnding(_ src: inout Source) -> Bool { + guard let next = src.peek() else { return true } + switch next { + case "=": + // End of a key. + return true + case ":", "[", "]": + // POSIX character property endings to cover ':]', ']', and '[' as the + // start of a nested character class. + return true + case "}": + // Ending of '\p{'. We cover this for POSIX too as it's not a valid + // character property name anyway, and it's nice not to have diverging + // logic for these cases. + return true + default: + // We may want to handle other metacharacters here, e.g '{', '(', ')', + // as they're not valid character property names. However for now + // let's tend on the side of forming an unknown property name in case + // these characters are ever used in future character property names + // (though it's very unlikely). Users can always escape e.g the ':' + // in '[:' if they definitely want a custom character class. + return false } - try src.expect(sequence: end) - return try Source.classifyCharacterPropertyValueOnly(lhs) } + // We should either have: + // - 'x=y' where 'x' is a property key, and 'y' is a value. + // - 'y' where 'y' is a value (or a bool key with an inferred value of true) + // and its key is inferred. + let lhs = lexUntil(atPossibleEnding).value + if tryEat("=") { + let rhs = lexUntil(atPossibleEnding).value + return (lhs, rhs) + } + return (nil, lhs) + } + + private static func classifyCharacterPropertyContents( + key: String?, value: String + ) throws -> AST.Atom.CharacterProperty.Kind { + if let key = key { + return try classifyCharacterProperty(key: key, value: value) + } + return try classifyCharacterPropertyValueOnly(value) } /// Try to consume a character property. @@ -1164,7 +1221,10 @@ extension Source { let isInverted = src.peek() == "P" src.advance(2) - let prop = try src.lexCharacterPropertyContents(end: "}").value + let (key, value) = src.lexCharacterPropertyKeyValue() + let prop = try Source.classifyCharacterPropertyContents(key: key, + value: value) + try src.expect("}") return .init(prop, isInverted: isInverted, isPOSIX: false) } } @@ -1758,11 +1818,8 @@ extension Source { if !customCC && (src.peek() == ")" || src.peek() == "|") { return nil } // TODO: Store customCC in the atom, if that's useful - // POSIX character property. This is only allowed in a custom character - // class. - // TODO: Can we try and recover and diagnose these outside character - // classes? - if customCC, let prop = try src.lexPOSIXCharacterProperty()?.value { + // POSIX character property. + if let prop = try src.lexPOSIXCharacterProperty(context: context)?.value { return .property(prop) } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index c3aa3500b..d70a1f14f 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -403,7 +403,7 @@ extension Parser { } // Check if we have the start of a custom character class '['. - if let cccStart = try source.lexCustomCCStart() { + if let cccStart = try source.lexCustomCCStart(context: context) { return .customCharacterClass( try parseCustomCharacterClass(cccStart)) } @@ -487,7 +487,7 @@ extension Parser { while source.peek() != "]" && source.peekCCBinOp() == nil { // Nested custom character class. - if let cccStart = try source.lexCustomCCStart() { + if let cccStart = try source.lexCustomCCStart(context: context) { members.append(.custom(try parseCustomCharacterClass(cccStart))) continue } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 5e3defb11..aa0ebbf98 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -474,6 +474,26 @@ extension RegexTests { parseTest( "[a^]", charClass("a", "^")) + // These are custom character classes, not invalid POSIX character classes. + // TODO: This behavior is subtle, we ought to warn. + parseTest("[[:space]]", charClass(charClass(":", "s", "p", "a", "c", "e"))) + parseTest("[:space:]", charClass(":", "s", "p", "a", "c", "e", ":")) + parseTest("[:a]", charClass(":", "a")) + parseTest("[a:]", charClass("a", ":")) + parseTest("[:]", charClass(":")) + parseTest("[::]", charClass(":", ":")) + parseTest("[:=:]", charClass(":", "=", ":")) + parseTest("[[:]]", charClass(charClass(":"))) + parseTest("[[:a=b=c:]]", charClass(charClass(":", "a", "=", "b", "=", "c", ":"))) + + parseTest(#"[[:a[b]:]]"#, charClass(charClass(":", "a", charClass("b"), ":"))) + parseTest(#"[[:a]][:]"#, concat(charClass(charClass(":", "a")), charClass(":"))) + parseTest(#"[[:a]]"#, charClass(charClass(":", "a"))) + parseTest(#"[[:}]]"#, charClass(charClass(":", "}"))) + parseTest(#"[[:{]]"#, charClass(charClass(":", "{"))) + parseTest(#"[[:{:]]"#, charClass(posixProp_m(.other(key: nil, value: "{")))) + parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":"))) + parseTest( #"\D\S\W"#, concat( @@ -1096,9 +1116,13 @@ extension RegexTests { #"\p{C}+"#, oneOrMore(of: prop(.generalCategory(.other)))) + // TODO: Start erroring on these? parseTest(#"\p{Lx}"#, prop(.other(key: nil, value: "Lx"))) parseTest(#"\p{gcL}"#, prop(.other(key: nil, value: "gcL"))) parseTest(#"\p{x=y}"#, prop(.other(key: "x", value: "y"))) + parseTest(#"\p{aaa(b)}"#, prop(.other(key: nil, value: "aaa(b)"))) + parseTest("[[:a():]]", charClass(posixProp_m(.other(key: nil, value: "a()")))) + parseTest(#"\p{aaa\p{b}}"#, concat(prop(.other(key: nil, value: #"aaa\p{b"#)), "}")) // UAX44-LM3 means all of the below are equivalent. let lowercaseLetter = prop(.generalCategory(.lowercaseLetter)) @@ -2183,7 +2207,11 @@ extension RegexTests { diagnosticTest(#"\N{A"#, .expected("}")) diagnosticTest(#"\N{U+A"#, .expected("}")) diagnosticTest(#"\p{a"#, .expected("}")) - diagnosticTest(#"\p{a="#, .expected("}")) + diagnosticTest(#"\p{a="#, .emptyProperty) + diagnosticTest(#"\p{a=}"#, .emptyProperty) + diagnosticTest(#"\p{a=b"#, .expected("}")) + diagnosticTest(#"\p{aaa[b]}"#, .expected("}")) + diagnosticTest(#"\p{a=b=c}"#, .expected("}")) diagnosticTest(#"(?#"#, .expected(")")) diagnosticTest(#"(?x"#, .expected(")")) @@ -2218,6 +2246,15 @@ extension RegexTests { // the closing bracket. diagnosticTest("[]", .expected("]")) + diagnosticTest("[:a", .expected("]")) + diagnosticTest("[:a:", .expected("]")) + diagnosticTest("[[:a", .expected("]")) + diagnosticTest("[[:a:", .expected("]")) + diagnosticTest("[[:a[:]", .expected("]")) + + diagnosticTest("[[::]]", .emptyProperty) + diagnosticTest("[[:=:]]", .emptyProperty) + // MARK: Bad escapes diagnosticTest("\\", .expectedEscape)