From d3bd6ad9544e3bcfd7f84ad8c2afe16517bf604d Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 21 Mar 2022 19:30:32 +0000 Subject: [PATCH 1/4] Error on unknown escape sequences Throw an error for unknown a-z escape sequences as well as non-ASCII non-whitespace escape sequences. --- .../Regex/Parse/Diagnostics.swift | 3 ++ .../Regex/Parse/LexicalAnalysis.swift | 11 ++++- Tests/RegexTests/ParseTests.swift | 42 ++++++++++++++----- 3 files changed, 45 insertions(+), 11 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index b9c99d9d3..d4c809045 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -39,6 +39,7 @@ enum ParseError: Error, Hashable { case expectedNonEmptyContents case expectedEscape + case invalidEscape(Character) case cannotReferToWholePattern @@ -107,6 +108,8 @@ extension ParseError: CustomStringConvertible { return "expected non-empty contents" case .expectedEscape: return "expected escape sequence" + case .invalidEscape(let c): + return "invalid escape sequence '\\\(c)'" case .cannotReferToWholePattern: return "cannot refer to whole pattern here" case .notQuantifiable: diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index cfab75312..4eb0ebea4 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -1489,8 +1489,17 @@ extension Source { return try .scalar( src.expectUnicodeScalar(escapedCharacter: char).value) default: - return .char(char) + break } + + // We only allow unknown escape sequences for non-letter ASCII, and + // non-ASCII whitespace. + guard (char.isASCII && !char.isLetter) || + (!char.isASCII && char.isWhitespace) + else { + throw ParseError.invalidEscape(char) + } + return .char(char) } } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 6e511767a..69a3a785b 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -544,9 +544,8 @@ extension RegexTests { #"a\Q \Q \\.\Eb"#, concat("a", quote(#" \Q \\."#), "b")) - // These follow the PCRE behavior. + // This follows the PCRE behavior. parseTest(#"\Q\\E"#, quote("\\")) - parseTest(#"\E"#, "E") parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"), syntax: .experimental) @@ -566,6 +565,16 @@ extension RegexTests { parseTest(#"["-"]"#, charClass(range_m("\"", "\""))) + // MARK: Escapes + + // Not metachars, but we allow their escape as ASCII. + parseTest(#"\<"#, "<") + parseTest(#"\ "#, " ") + parseTest(#"\\"#, "\\") + + // Escaped U+3000 IDEOGRAPHIC SPACE. + parseTest(#"\\#u{3000}"#, "\u{3000}") + // MARK: Comments parseTest( @@ -989,13 +998,6 @@ extension RegexTests { // Backreferences are not valid in custom character classes. parseTest(#"[\8]"#, charClass("8")) parseTest(#"[\9]"#, charClass("9")) - parseTest(#"[\g]"#, charClass("g")) - parseTest(#"[\g+30]"#, charClass("g", "+", "3", "0")) - parseTest(#"[\g{1}]"#, charClass("g", "{", "1", "}")) - parseTest(#"[\k'a']"#, charClass("k", "'", "a", "'")) - - parseTest(#"\g"#, atom(.char("g"))) - parseTest(#"\k"#, atom(.char("k"))) // MARK: Character names. @@ -1526,7 +1528,7 @@ extension RegexTests { parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x")) parseWithDelimitersTest(#"re'šŸ”„šŸ‡©šŸ‡°'"#, concat("šŸ”„", "šŸ‡©šŸ‡°")) - parseWithDelimitersTest(#"re'\šŸ”„āœ…'"#, concat("šŸ”„", "āœ…")) + parseWithDelimitersTest(#"re'šŸ”„āœ…'"#, concat("šŸ”„", "āœ…")) // Printable ASCII characters. delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##) @@ -1875,6 +1877,26 @@ extension RegexTests { diagnosticTest("\\", .expectedEscape) + // TODO: Custom diagnostic for expected backref + diagnosticTest(#"\g"#, .invalidEscape("g")) + diagnosticTest(#"\k"#, .invalidEscape("k")) + + // TODO: Custom diagnostic for backref in custom char class + diagnosticTest(#"[\g]"#, .invalidEscape("g")) + diagnosticTest(#"[\g+30]"#, .invalidEscape("g")) + diagnosticTest(#"[\g{1}]"#, .invalidEscape("g")) + diagnosticTest(#"[\k'a']"#, .invalidEscape("k")) + + // TODO: Custom diagnostic for missing '\Q' + diagnosticTest(#"\E"#, .invalidEscape("E")) + + // Non-ASCII non-whitespace cases. + diagnosticTest(#"\šŸ”„"#, .invalidEscape("šŸ”„")) + diagnosticTest(#"\šŸ‡©šŸ‡°"#, .invalidEscape("šŸ‡©šŸ‡°")) + diagnosticTest(#"\e\#u{301}"#, .invalidEscape("e\u{301}")) + diagnosticTest(#"\\#u{E9}"#, .invalidEscape("Ć©")) + diagnosticTest(#"\Ė‚"#, .invalidEscape("Ė‚")) + // MARK: Text Segment options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) From 5a52d531097d3e67daf2c7af3a863d66aaf6388f Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 21 Mar 2022 19:30:32 +0000 Subject: [PATCH 2/4] Allow certain escape sequences in character class ranges Certain escape sequences express a unicode scalar and as such are valid in a range. --- Sources/_RegexParser/Regex/AST/Atom.swift | 60 ++++++++++++++++++-- Sources/_RegexParser/Regex/Parse/Parse.swift | 5 +- Tests/RegexTests/MatchTests.swift | 29 ++++++++++ Tests/RegexTests/ParseTests.swift | 31 ++++++++++ 4 files changed, 118 insertions(+), 7 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index bc346469b..0aa0951c5 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -641,17 +641,67 @@ extension AST.Atom { case .scalar(let s): return Character(s) + case .escaped(let c): + switch c { + // TODO: Should we separate these into a separate enum? Or move the + // specifics of the scalar to the DSL tree? + case .alarm: + return "\u{7}" + case .backspace: + return "\u{8}" + case .escape: + return "\u{1B}" + case .formfeed: + return "\u{C}" + case .newline: + return "\n" + case .carriageReturn: + return "\r" + case .tab: + return "\t" + + case .singleDataUnit, .decimalDigit, .notDecimalDigit, + .horizontalWhitespace, .notHorizontalWhitespace, .notNewline, + .newlineSequence, .whitespace, .notWhitespace, .verticalTab, + .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster, + .wordBoundary, .notWordBoundary, .startOfSubject, + .endOfSubjectBeforeNewline, .endOfSubject, + .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar, + .textSegment, .notTextSegment: + return nil + } + case .keyboardControl, .keyboardMeta, .keyboardMetaControl: - // TODO: Not a character per-say, what should we do? - fallthrough + // TODO: These should have unicode scalar values. + return nil - case .property, .escaped, .any, .startOfLine, .endOfLine, - .backreference, .subpattern, .namedCharacter, .callout, - .backtrackingDirective: + case .namedCharacter: + // TODO: This should have a unicode scalar value depending on the name + // given. + // TODO: Do we want to validate and assign a scalar value when building + // the AST? Or defer for the matching engine? + return nil + + case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern, + .callout, .backtrackingDirective: return nil } } + /// Whether this atom is valid as the operand of a custom character class + /// range. + public var isValidCharacterClassRangeBound: Bool { + // If we have a literal character value for this, it can be used as a bound. + if literalCharacterValue != nil { return true } + switch kind { + // \cx, \C-x, \M-x, \M-\C-x, \N{...} + case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter: + return true + default: + return false + } + } + /// Produce a string literal representation of the atom, if possible /// /// Individual characters will be returned, Unicode scalars will be diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 296956fdc..4481cf602 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -489,10 +489,11 @@ extension Parser { // Range between atoms. if let (dashLoc, rhs) = try source.lexCustomCharClassRangeEnd(context: context) { - guard atom.literalCharacterValue != nil && - rhs.literalCharacterValue != nil else { + guard atom.isValidCharacterClassRangeBound && + rhs.isValidCharacterClassRangeBound else { throw ParseError.invalidCharacterClassRangeOperand } + // TODO: Validate lower <= upper? members.append(.range(.init(atom, dashLoc, rhs))) continue } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 52db17aa7..67412d262 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -594,6 +594,35 @@ extension RegexTests { firstMatchTest("[[:script=Greek:]]", input: "123αβγxyz", match: "α") + func scalar(_ u: UnicodeScalar) -> UInt32 { u.value } + + // Currently not supported in the matching engine. + for s in scalar("\u{C}") ... scalar("\u{1B}") { + let u = UnicodeScalar(s)! + firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)", + xfail: true) + } + for u: UnicodeScalar in ["\u{7}", "\u{8}"] { + firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)", + xfail: true) + } + for s in scalar("\u{A}") ... scalar("\u{D}") { + let u = UnicodeScalar(s)! + firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)", + xfail: true) + } + firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}", + xfail: true) + + for c: UnicodeScalar in ["a", "b", "c"] { + firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)", + xfail: true) + } + for c: UnicodeScalar in ["$", "%", "&", "'"] { + firstMatchTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, + input: "#()\(c)", match: "\(c)", xfail: true) + } + // MARK: Operators firstMatchTest( diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 69a3a785b..76327ac64 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -494,6 +494,25 @@ extension RegexTests { parseTest("[*]", charClass("*")) parseTest("[{0}]", charClass("{", "0", "}")) + parseTest(#"[\f-\e]"#, charClass( + range_m(.escaped(.formfeed), .escaped(.escape)))) + parseTest(#"[\a-\b]"#, charClass( + range_m(.escaped(.alarm), .escaped(.backspace)))) + parseTest(#"[\n-\r]"#, charClass( + range_m(.escaped(.newline), .escaped(.carriageReturn)))) + parseTest(#"[\t-\t]"#, charClass( + range_m(.escaped(.tab), .escaped(.tab)))) + + parseTest(#"[\cX-\cY\C-A-\C-B\M-\C-A-\M-\C-B\M-A-\M-B]"#, charClass( + range_m(.keyboardControl("X"), .keyboardControl("Y")), + range_m(.keyboardControl("A"), .keyboardControl("B")), + range_m(.keyboardMetaControl("A"), .keyboardMetaControl("B")), + range_m(.keyboardMeta("A"), .keyboardMeta("B")) + )) + + parseTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass( + range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE")))) + // MARK: Operators parseTest( @@ -575,6 +594,15 @@ extension RegexTests { // Escaped U+3000 IDEOGRAPHIC SPACE. parseTest(#"\\#u{3000}"#, "\u{3000}") + // Control and meta controls. + parseTest(#"\c "#, atom(.keyboardControl(" "))) + parseTest(#"\c!"#, atom(.keyboardControl("!"))) + parseTest(#"\c~"#, atom(.keyboardControl("~"))) + parseTest(#"\C--"#, atom(.keyboardControl("-"))) + parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a"))) + parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-"))) + parseTest(#"\M-a"#, atom(.keyboardMeta("a"))) + // MARK: Comments parseTest( @@ -1877,6 +1905,9 @@ extension RegexTests { diagnosticTest("\\", .expectedEscape) + // TODO: Custom diagnostic for control sequence + diagnosticTest(#"\c"#, .unexpectedEndOfInput) + // TODO: Custom diagnostic for expected backref diagnosticTest(#"\g"#, .invalidEscape("g")) diagnosticTest(#"\k"#, .invalidEscape("k")) From 692f0fd15bbced7f347ed8b99021d9ad45148369 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 21 Mar 2022 19:30:33 +0000 Subject: [PATCH 3/4] Remove obsolete CharacterClass model computation This is now done from the DSLTree. --- .../_StringProcessing/CharacterClass.swift | 75 ------------------- 1 file changed, 75 deletions(-) diff --git a/Sources/_StringProcessing/CharacterClass.swift b/Sources/_StringProcessing/CharacterClass.swift index 0b95e08b4..d44fa9fb2 100644 --- a/Sources/_StringProcessing/CharacterClass.swift +++ b/Sources/_StringProcessing/CharacterClass.swift @@ -319,21 +319,6 @@ extension CharacterClass { } } -extension AST.Node { - /// If this has a character class representation, whether built-in or custom, return it. - /// - /// TODO: Not sure if this the right model type, but I suspect we'll want to produce - /// something like this on demand - var characterClass: CharacterClass? { - switch self { - case let .customCharacterClass(cc): return cc.modelCharacterClass - case let .atom(a): return a.characterClass - - default: return nil - } - } -} - extension DSLTree.Node { var characterClass: CharacterClass? { switch self { @@ -502,66 +487,6 @@ extension DSLTree.CustomCharacterClass { } } -extension AST.CustomCharacterClass { - /// The model character class for this custom character class. - var modelCharacterClass: CharacterClass? { - typealias Component = CharacterClass.CharacterSetComponent - func getComponents(_ members: [Member]) -> [Component]? { - var result = Array() - for m in members { - switch m { - case .custom(let cc): - guard let cc = cc.modelCharacterClass else { - return nil - } - result.append(.characterClass(cc)) - case .range(let r): - result.append(.range( - r.lhs.literalCharacterValue! ... - r.rhs.literalCharacterValue!)) - - case .atom(let a): - if let cc = a.characterClass { - result.append(.characterClass(cc)) - } else if let lit = a.literalCharacterValue { - result.append(.character(lit)) - } else { - return nil - } - - case .quote(let q): - // Decompose quoted literal into literal characters. - result += q.literal.map { .character($0) } - - case .trivia: - // Not semantically important. - break - - case .setOperation(let lhs, let op, let rhs): - // FIXME: CharacterClass wasn't designed for set operations with - // multiple components in each operand, we should fix that. For now, - // just produce custom components. - guard let lhs = getComponents(lhs), - let rhs = getComponents(rhs) - else { - return nil - } - result.append(.setOperation(.init( - lhs: .characterClass(.custom(lhs)), - op: op.value, - rhs: .characterClass(.custom(rhs))))) - } - } - return result - } - guard let comps = getComponents(members) else { - return nil - } - let cc = CharacterClass.custom(comps) - return self.isInverted ? cc.inverted : cc - } -} - extension CharacterClass { // FIXME: Calling on inverted sets wont be the same as the // inverse of a boundary if at the start or end of the From cdf98c5f94bf159450015cc72e675a0930b9dd36 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 21 Mar 2022 19:30:33 +0000 Subject: [PATCH 4/4] Forbid empty character classes As per PCRE, Oniguruma, and ICU, a first character of `]` is treated as literal. --- Sources/_RegexParser/Regex/Parse/Parse.swift | 6 ++++++ Tests/RegexTests/ParseTests.swift | 8 ++++++++ 2 files changed, 14 insertions(+) diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 4481cf602..7867073e6 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -425,6 +425,12 @@ extension Parser { try source.expectNonEmpty() var members: Array = [] + + // We can eat an initial ']', as PCRE, Oniguruma, and ICU forbid empty + // character classes, and assume an initial ']' is literal. + if let loc = source.tryEatWithLoc("]") { + members.append(.atom(.init(.char("]"), loc))) + } try parseCCCMembers(into: &members) // If we have a binary set operator, parse it and the next members. Note diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 76327ac64..f6f31c075 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -428,6 +428,10 @@ extension RegexTests { parseTest("[-]", charClass("-")) + // Empty character classes are forbidden, therefore this is a character + // class of literal ']'. + parseTest("[]]", charClass("]")) + // These are metacharacters in certain contexts, but normal characters // otherwise. parseTest( @@ -1901,6 +1905,10 @@ extension RegexTests { diagnosticTest("(?")) diagnosticTest("(?", .expected(")")) + // The first ']' of a custom character class is literal, so this is missing + // the closing bracket. + diagnosticTest("[]", .expected("]")) + // MARK: Bad escapes diagnosticTest("\\", .expectedEscape)