diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index bc346469b..0aa0951c5 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -641,17 +641,67 @@ extension AST.Atom { case .scalar(let s): return Character(s) + case .escaped(let c): + switch c { + // TODO: Should we separate these into a separate enum? Or move the + // specifics of the scalar to the DSL tree? + case .alarm: + return "\u{7}" + case .backspace: + return "\u{8}" + case .escape: + return "\u{1B}" + case .formfeed: + return "\u{C}" + case .newline: + return "\n" + case .carriageReturn: + return "\r" + case .tab: + return "\t" + + case .singleDataUnit, .decimalDigit, .notDecimalDigit, + .horizontalWhitespace, .notHorizontalWhitespace, .notNewline, + .newlineSequence, .whitespace, .notWhitespace, .verticalTab, + .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster, + .wordBoundary, .notWordBoundary, .startOfSubject, + .endOfSubjectBeforeNewline, .endOfSubject, + .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar, + .textSegment, .notTextSegment: + return nil + } + case .keyboardControl, .keyboardMeta, .keyboardMetaControl: - // TODO: Not a character per-say, what should we do? - fallthrough + // TODO: These should have unicode scalar values. + return nil - case .property, .escaped, .any, .startOfLine, .endOfLine, - .backreference, .subpattern, .namedCharacter, .callout, - .backtrackingDirective: + case .namedCharacter: + // TODO: This should have a unicode scalar value depending on the name + // given. + // TODO: Do we want to validate and assign a scalar value when building + // the AST? Or defer for the matching engine? + return nil + + case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern, + .callout, .backtrackingDirective: return nil } } + /// Whether this atom is valid as the operand of a custom character class + /// range. + public var isValidCharacterClassRangeBound: Bool { + // If we have a literal character value for this, it can be used as a bound. + if literalCharacterValue != nil { return true } + switch kind { + // \cx, \C-x, \M-x, \M-\C-x, \N{...} + case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter: + return true + default: + return false + } + } + /// Produce a string literal representation of the atom, if possible /// /// Individual characters will be returned, Unicode scalars will be diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index b9c99d9d3..d4c809045 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -39,6 +39,7 @@ enum ParseError: Error, Hashable { case expectedNonEmptyContents case expectedEscape + case invalidEscape(Character) case cannotReferToWholePattern @@ -107,6 +108,8 @@ extension ParseError: CustomStringConvertible { return "expected non-empty contents" case .expectedEscape: return "expected escape sequence" + case .invalidEscape(let c): + return "invalid escape sequence '\\\(c)'" case .cannotReferToWholePattern: return "cannot refer to whole pattern here" case .notQuantifiable: diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index cfab75312..4eb0ebea4 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -1489,8 +1489,17 @@ extension Source { return try .scalar( src.expectUnicodeScalar(escapedCharacter: char).value) default: - return .char(char) + break } + + // We only allow unknown escape sequences for non-letter ASCII, and + // non-ASCII whitespace. + guard (char.isASCII && !char.isLetter) || + (!char.isASCII && char.isWhitespace) + else { + throw ParseError.invalidEscape(char) + } + return .char(char) } } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 296956fdc..7867073e6 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -425,6 +425,12 @@ extension Parser { try source.expectNonEmpty() var members: Array = [] + + // We can eat an initial ']', as PCRE, Oniguruma, and ICU forbid empty + // character classes, and assume an initial ']' is literal. + if let loc = source.tryEatWithLoc("]") { + members.append(.atom(.init(.char("]"), loc))) + } try parseCCCMembers(into: &members) // If we have a binary set operator, parse it and the next members. Note @@ -489,10 +495,11 @@ extension Parser { // Range between atoms. if let (dashLoc, rhs) = try source.lexCustomCharClassRangeEnd(context: context) { - guard atom.literalCharacterValue != nil && - rhs.literalCharacterValue != nil else { + guard atom.isValidCharacterClassRangeBound && + rhs.isValidCharacterClassRangeBound else { throw ParseError.invalidCharacterClassRangeOperand } + // TODO: Validate lower <= upper? members.append(.range(.init(atom, dashLoc, rhs))) continue } diff --git a/Sources/_StringProcessing/CharacterClass.swift b/Sources/_StringProcessing/CharacterClass.swift index 0b95e08b4..d44fa9fb2 100644 --- a/Sources/_StringProcessing/CharacterClass.swift +++ b/Sources/_StringProcessing/CharacterClass.swift @@ -319,21 +319,6 @@ extension CharacterClass { } } -extension AST.Node { - /// If this has a character class representation, whether built-in or custom, return it. - /// - /// TODO: Not sure if this the right model type, but I suspect we'll want to produce - /// something like this on demand - var characterClass: CharacterClass? { - switch self { - case let .customCharacterClass(cc): return cc.modelCharacterClass - case let .atom(a): return a.characterClass - - default: return nil - } - } -} - extension DSLTree.Node { var characterClass: CharacterClass? { switch self { @@ -502,66 +487,6 @@ extension DSLTree.CustomCharacterClass { } } -extension AST.CustomCharacterClass { - /// The model character class for this custom character class. - var modelCharacterClass: CharacterClass? { - typealias Component = CharacterClass.CharacterSetComponent - func getComponents(_ members: [Member]) -> [Component]? { - var result = Array() - for m in members { - switch m { - case .custom(let cc): - guard let cc = cc.modelCharacterClass else { - return nil - } - result.append(.characterClass(cc)) - case .range(let r): - result.append(.range( - r.lhs.literalCharacterValue! ... - r.rhs.literalCharacterValue!)) - - case .atom(let a): - if let cc = a.characterClass { - result.append(.characterClass(cc)) - } else if let lit = a.literalCharacterValue { - result.append(.character(lit)) - } else { - return nil - } - - case .quote(let q): - // Decompose quoted literal into literal characters. - result += q.literal.map { .character($0) } - - case .trivia: - // Not semantically important. - break - - case .setOperation(let lhs, let op, let rhs): - // FIXME: CharacterClass wasn't designed for set operations with - // multiple components in each operand, we should fix that. For now, - // just produce custom components. - guard let lhs = getComponents(lhs), - let rhs = getComponents(rhs) - else { - return nil - } - result.append(.setOperation(.init( - lhs: .characterClass(.custom(lhs)), - op: op.value, - rhs: .characterClass(.custom(rhs))))) - } - } - return result - } - guard let comps = getComponents(members) else { - return nil - } - let cc = CharacterClass.custom(comps) - return self.isInverted ? cc.inverted : cc - } -} - extension CharacterClass { // FIXME: Calling on inverted sets wont be the same as the // inverse of a boundary if at the start or end of the diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 52db17aa7..67412d262 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -594,6 +594,35 @@ extension RegexTests { firstMatchTest("[[:script=Greek:]]", input: "123αβγxyz", match: "α") + func scalar(_ u: UnicodeScalar) -> UInt32 { u.value } + + // Currently not supported in the matching engine. + for s in scalar("\u{C}") ... scalar("\u{1B}") { + let u = UnicodeScalar(s)! + firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)", + xfail: true) + } + for u: UnicodeScalar in ["\u{7}", "\u{8}"] { + firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)", + xfail: true) + } + for s in scalar("\u{A}") ... scalar("\u{D}") { + let u = UnicodeScalar(s)! + firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)", + xfail: true) + } + firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}", + xfail: true) + + for c: UnicodeScalar in ["a", "b", "c"] { + firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)", + xfail: true) + } + for c: UnicodeScalar in ["$", "%", "&", "'"] { + firstMatchTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, + input: "#()\(c)", match: "\(c)", xfail: true) + } + // MARK: Operators firstMatchTest( diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 6e511767a..f6f31c075 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -428,6 +428,10 @@ extension RegexTests { parseTest("[-]", charClass("-")) + // Empty character classes are forbidden, therefore this is a character + // class of literal ']'. + parseTest("[]]", charClass("]")) + // These are metacharacters in certain contexts, but normal characters // otherwise. parseTest( @@ -494,6 +498,25 @@ extension RegexTests { parseTest("[*]", charClass("*")) parseTest("[{0}]", charClass("{", "0", "}")) + parseTest(#"[\f-\e]"#, charClass( + range_m(.escaped(.formfeed), .escaped(.escape)))) + parseTest(#"[\a-\b]"#, charClass( + range_m(.escaped(.alarm), .escaped(.backspace)))) + parseTest(#"[\n-\r]"#, charClass( + range_m(.escaped(.newline), .escaped(.carriageReturn)))) + parseTest(#"[\t-\t]"#, charClass( + range_m(.escaped(.tab), .escaped(.tab)))) + + parseTest(#"[\cX-\cY\C-A-\C-B\M-\C-A-\M-\C-B\M-A-\M-B]"#, charClass( + range_m(.keyboardControl("X"), .keyboardControl("Y")), + range_m(.keyboardControl("A"), .keyboardControl("B")), + range_m(.keyboardMetaControl("A"), .keyboardMetaControl("B")), + range_m(.keyboardMeta("A"), .keyboardMeta("B")) + )) + + parseTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass( + range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE")))) + // MARK: Operators parseTest( @@ -544,9 +567,8 @@ extension RegexTests { #"a\Q \Q \\.\Eb"#, concat("a", quote(#" \Q \\."#), "b")) - // These follow the PCRE behavior. + // This follows the PCRE behavior. parseTest(#"\Q\\E"#, quote("\\")) - parseTest(#"\E"#, "E") parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"), syntax: .experimental) @@ -566,6 +588,25 @@ extension RegexTests { parseTest(#"["-"]"#, charClass(range_m("\"", "\""))) + // MARK: Escapes + + // Not metachars, but we allow their escape as ASCII. + parseTest(#"\<"#, "<") + parseTest(#"\ "#, " ") + parseTest(#"\\"#, "\\") + + // Escaped U+3000 IDEOGRAPHIC SPACE. + parseTest(#"\\#u{3000}"#, "\u{3000}") + + // Control and meta controls. + parseTest(#"\c "#, atom(.keyboardControl(" "))) + parseTest(#"\c!"#, atom(.keyboardControl("!"))) + parseTest(#"\c~"#, atom(.keyboardControl("~"))) + parseTest(#"\C--"#, atom(.keyboardControl("-"))) + parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a"))) + parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-"))) + parseTest(#"\M-a"#, atom(.keyboardMeta("a"))) + // MARK: Comments parseTest( @@ -989,13 +1030,6 @@ extension RegexTests { // Backreferences are not valid in custom character classes. parseTest(#"[\8]"#, charClass("8")) parseTest(#"[\9]"#, charClass("9")) - parseTest(#"[\g]"#, charClass("g")) - parseTest(#"[\g+30]"#, charClass("g", "+", "3", "0")) - parseTest(#"[\g{1}]"#, charClass("g", "{", "1", "}")) - parseTest(#"[\k'a']"#, charClass("k", "'", "a", "'")) - - parseTest(#"\g"#, atom(.char("g"))) - parseTest(#"\k"#, atom(.char("k"))) // MARK: Character names. @@ -1526,7 +1560,7 @@ extension RegexTests { parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x")) parseWithDelimitersTest(#"re'šŸ”„šŸ‡©šŸ‡°'"#, concat("šŸ”„", "šŸ‡©šŸ‡°")) - parseWithDelimitersTest(#"re'\šŸ”„āœ…'"#, concat("šŸ”„", "āœ…")) + parseWithDelimitersTest(#"re'šŸ”„āœ…'"#, concat("šŸ”„", "āœ…")) // Printable ASCII characters. delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##) @@ -1871,10 +1905,37 @@ extension RegexTests { diagnosticTest("(?")) diagnosticTest("(?", .expected(")")) + // The first ']' of a custom character class is literal, so this is missing + // the closing bracket. + diagnosticTest("[]", .expected("]")) + // MARK: Bad escapes diagnosticTest("\\", .expectedEscape) + // TODO: Custom diagnostic for control sequence + diagnosticTest(#"\c"#, .unexpectedEndOfInput) + + // TODO: Custom diagnostic for expected backref + diagnosticTest(#"\g"#, .invalidEscape("g")) + diagnosticTest(#"\k"#, .invalidEscape("k")) + + // TODO: Custom diagnostic for backref in custom char class + diagnosticTest(#"[\g]"#, .invalidEscape("g")) + diagnosticTest(#"[\g+30]"#, .invalidEscape("g")) + diagnosticTest(#"[\g{1}]"#, .invalidEscape("g")) + diagnosticTest(#"[\k'a']"#, .invalidEscape("k")) + + // TODO: Custom diagnostic for missing '\Q' + diagnosticTest(#"\E"#, .invalidEscape("E")) + + // Non-ASCII non-whitespace cases. + diagnosticTest(#"\šŸ”„"#, .invalidEscape("šŸ”„")) + diagnosticTest(#"\šŸ‡©šŸ‡°"#, .invalidEscape("šŸ‡©šŸ‡°")) + diagnosticTest(#"\e\#u{301}"#, .invalidEscape("e\u{301}")) + diagnosticTest(#"\\#u{E9}"#, .invalidEscape("Ć©")) + diagnosticTest(#"\Ė‚"#, .invalidEscape("Ė‚")) + // MARK: Text Segment options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions)