From 7b35b67aa9381e2a6a87bf4b090a5feeb0162f2e Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 29 Jun 2022 11:15:01 +0100 Subject: [PATCH 1/6] Improve a diagnostic message --- Sources/_RegexParser/Regex/Parse/Diagnostics.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index 5bca2ad13..abb4afe56 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -179,7 +179,7 @@ extension ParseError: CustomStringConvertible { case .unsupportedDotNetSubtraction: return "subtraction with '-' is unsupported; use '--' instead" case .emptyProperty: - return "empty property" + return "expected property name" case .unknownProperty(let key, let value): if let key = key { return "unknown character property '\(key)=\(value)'" From 30623df819d58bae22a1961308a2c4073d8062a1 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 29 Jun 2022 11:15:01 +0100 Subject: [PATCH 2/6] Introduce AST.Atom.Number This stores both a source location, and has the ability to be `nil`, which is necessary to enable parser recovery in cases where we expect a number but parse something that e.g overflows. 
--- Sources/_RegexParser/Regex/AST/AST.swift | 25 +++- Sources/_RegexParser/Regex/AST/Atom.swift | 14 +- .../_RegexParser/Regex/AST/Conditional.swift | 8 +- .../Regex/AST/MatchingOptions.swift | 6 +- .../Regex/AST/Quantification.swift | 16 +-- .../Regex/Parse/LexicalAnalysis.swift | 69 ++++----- Sources/_RegexParser/Regex/Parse/Parse.swift | 2 + Sources/_RegexParser/Regex/Parse/Sema.swift | 9 +- .../_RegexParser/Regex/Printing/DumpAST.swift | 16 ++- .../Regex/Printing/PrintAsCanonical.swift | 18 ++- Sources/_StringProcessing/ByteCodeGen.swift | 8 +- .../_StringProcessing/PrintAsPattern.swift | 14 +- Sources/_StringProcessing/Regex/DSLTree.swift | 8 +- .../Utility/ASTBuilder.swift | 71 +++++---- Tests/RegexTests/ParseTests.swift | 136 +++++++++--------- 15 files changed, 252 insertions(+), 168 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/AST.swift b/Sources/_RegexParser/Regex/AST/AST.swift index 44bc10828..97051db10 100644 --- a/Sources/_RegexParser/Regex/AST/AST.swift +++ b/Sources/_RegexParser/Regex/AST/AST.swift @@ -265,12 +265,12 @@ extension AST { public enum Kind: Hashable { // \n \gn \g{n} \g \g'n' (?n) (?(n)... // Oniguruma: \k, \k'n' - case absolute(Int) + case absolute(AST.Atom.Number) // \g{-n} \g<+n> \g'+n' \g<-n> \g'-n' (?+n) (?-n) // (?(+n)... (?(-n)... // Oniguruma: \k<-n> \k<+n> \k'-n' \k'+n' - case relative(Int) + case relative(AST.Atom.Number) // \k \k'name' \g{name} \k{name} (?P=name) // \g \g'name' (?&name) (?P>name) @@ -278,20 +278,33 @@ extension AST { case named(String) /// (?R), (?(R)..., which are equivalent to (?0), (?(0)... - static var recurseWholePattern: Kind { .absolute(0) } + static func recurseWholePattern(_ loc: SourceLocation) -> Kind { + .absolute(.init(0, at: loc)) + } + + /// Whether this is a reference that recurses the whole pattern, rather + /// than a group. 
+ public var recursesWholePattern: Bool { + switch self { + case .absolute(let a): + return a.value == 0 + default: + return false + } + } } public var kind: Kind /// An additional specifier supported by Oniguruma that specifies what /// recursion level the group being referenced belongs to. - public var recursionLevel: Located? + public var recursionLevel: AST.Atom.Number? /// The location of the inner numeric or textual reference, e.g the location /// of '-2' in '\g{-2}'. Note this includes the recursion level for e.g /// '\k'. public var innerLoc: SourceLocation - public init(_ kind: Kind, recursionLevel: Located? = nil, + public init(_ kind: Kind, recursionLevel: AST.Atom.Number? = nil, innerLoc: SourceLocation) { self.kind = kind self.recursionLevel = recursionLevel @@ -300,7 +313,7 @@ extension AST { /// Whether this is a reference that recurses the whole pattern, rather than /// a group. - public var recursesWholePattern: Bool { kind == .recurseWholePattern } + public var recursesWholePattern: Bool { kind.recursesWholePattern } } /// A set of global matching options in a regular expression literal. diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index a349c2a85..2d7ed61cc 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -113,6 +113,18 @@ extension AST.Atom { } extension AST.Atom { + public struct Number: Hashable { + /// The value, which may be `nil` in an invalid AST, e.g the parser expected + /// a number at a given location, or the parsed number overflowed. + public var value: Int? 
+ public var location: SourceLocation + + public init(_ value: Int?, at location: SourceLocation) { + self.value = value + self.location = location + } + } + public struct Scalar: Hashable { public var value: UnicodeScalar public var location: SourceLocation @@ -558,7 +570,7 @@ extension AST.Atom { /// A PCRE callout written `(?C...)` public struct PCRE: Hashable { public enum Argument: Hashable { - case number(Int) + case number(AST.Atom.Number) case string(String) } public var arg: AST.Located diff --git a/Sources/_RegexParser/Regex/AST/Conditional.swift b/Sources/_RegexParser/Regex/AST/Conditional.swift index c382a25b6..3a9a43be8 100644 --- a/Sources/_RegexParser/Regex/AST/Conditional.swift +++ b/Sources/_RegexParser/Regex/AST/Conditional.swift @@ -66,11 +66,13 @@ extension AST.Conditional { extension AST.Conditional.Condition { public struct PCREVersionNumber: Hashable { - public var major: Int - public var minor: Int + public var major: AST.Atom.Number + public var minor: AST.Atom.Number public var location: SourceLocation - public init(major: Int, minor: Int, _ location: SourceLocation) { + public init( + major: AST.Atom.Number, minor: AST.Atom.Number, _ location: SourceLocation + ) { self.major = major self.minor = minor self.location = location diff --git a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift index db813b407..c85c2b3d1 100644 --- a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift +++ b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift @@ -175,13 +175,13 @@ extension AST { } public enum Kind: Hashable { /// (*LIMIT_DEPTH=d) - case limitDepth(Located) + case limitDepth(AST.Atom.Number) /// (*LIMIT_HEAP=d) - case limitHeap(Located) + case limitHeap(AST.Atom.Number) /// (*LIMIT_MATCH=d) - case limitMatch(Located) + case limitMatch(AST.Atom.Number) /// (*NOTEMPTY) case notEmpty diff --git a/Sources/_RegexParser/Regex/AST/Quantification.swift 
b/Sources/_RegexParser/Regex/AST/Quantification.swift index c6d4f0101..7bc2e6620 100644 --- a/Sources/_RegexParser/Regex/AST/Quantification.swift +++ b/Sources/_RegexParser/Regex/AST/Quantification.swift @@ -37,13 +37,13 @@ extension AST { } public enum Amount: Hashable { - case zeroOrMore // * - case oneOrMore // + - case zeroOrOne // ? - case exactly(Located) // {n} - case nOrMore(Located) // {n,} - case upToN(Located) // {,n} - case range(Located, Located) // {n,m} + case zeroOrMore // * + case oneOrMore // + + case zeroOrOne // ? + case exactly(AST.Atom.Number) // {n} + case nOrMore(AST.Atom.Number) // {n,} + case upToN(AST.Atom.Number) // {,n} + case range(AST.Atom.Number, AST.Atom.Number) // {n,m} } public enum Kind: String, Hashable { @@ -58,7 +58,7 @@ extension AST { extension AST.Quantification.Amount { /// The bounds. - public var bounds: (atLeast: Int, atMost: Int?) { + public var bounds: (atLeast: Int?, atMost: Int?) { switch self { case .zeroOrMore: return (0, nil) case .oneOrMore: return (1, nil) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index be6f13fc7..691d8fa4a 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -253,18 +253,18 @@ extension Source { /// /// Throws on overflow /// - private mutating func lexNumber( - _ ty: Num.Type, _ kind: RadixKind - ) throws -> Located? { + private mutating func lexNumber( + _ kind: RadixKind + ) throws -> AST.Atom.Number? 
{ try recordLoc { src in - guard let str = src.tryEatPrefix(kind.characterFilter)?.string else { + guard let str = src.tryEatLocatedPrefix(kind.characterFilter) else { return nil } - guard let i = Num(str, radix: kind.radix) else { - throw ParseError.numberOverflow(str) + guard let i = Int(str.value, radix: kind.radix) else { + throw ParseError.numberOverflow(str.value) } - return i - } + return .init(i, at: str.location) + }.value } /// Try to eat a number off the front. @@ -273,11 +273,11 @@ extension Source { /// /// Throws on overflow /// - mutating func lexNumber() throws -> Located? { - try lexNumber(Int.self, .decimal) + mutating func lexNumber() throws -> AST.Atom.Number? { + try lexNumber(.decimal) } - mutating func expectNumber() throws -> Located { + mutating func expectNumber() throws -> AST.Atom.Number { guard let num = try lexNumber() else { throw ParseError.expectedNumber("", kind: .decimal) } @@ -488,9 +488,10 @@ extension Source { if let t = src.lexWhitespace() { trivia.append(t) } - let upperOpt = try src.lexNumber()?.map { upper in + var upperOpt = try src.lexNumber() + if closedRange == false { // If we have an open range, the upper bound should be adjusted down. - closedRange == true ? upper : upper - 1 + upperOpt?.value? 
-= 1 } if let t = src.lexWhitespace() { trivia.append(t) } @@ -1066,10 +1067,11 @@ extension Source { /// private mutating func expectPCREVersionNumber( ) throws -> AST.Conditional.Condition.PCREVersionNumber { - let nums = try recordLoc { src -> (major: Int, minor: Int) in - let major = try src.expectNumber().value + let nums = try recordLoc { src -> (major: AST.Atom.Number, + minor: AST.Atom.Number) in + let major = try src.expectNumber() try src.expect(".") - let minor = try src.expectNumber().value + let minor = try src.expectNumber() return (major, minor) } return .init(major: nums.value.major, minor: nums.value.minor, @@ -1119,7 +1121,7 @@ extension Source { } if let num = try src.lexNumber() { return .groupRecursionCheck( - .init(.absolute(num.value), innerLoc: num.location)) + .init(.absolute(num), innerLoc: num.location)) } return .recursionCheck } @@ -1406,20 +1408,21 @@ extension Source { let kind = try recordLoc { src -> AST.Reference.Kind? in try src.tryEating { src in // Note this logic should match canLexNumberedReference. - if src.tryEat("+"), let num = try src.lexNumber() { - return .relative(num.value) + if let plus = src.tryEatWithLoc("+"), let num = try src.lexNumber() { + return .relative(.init(num.value, at: num.location.union(with: plus))) } - if src.tryEat("-"), let num = try src.lexNumber() { - return .relative(-num.value) + if let minus = src.tryEatWithLoc("-"), let num = try src.lexNumber() { + let val = num.value.map { x in -x } + return .relative(.init(val, at: num.location.union(with: minus))) } if let num = try src.lexNumber() { - return .absolute(num.value) + return .absolute(num) } return nil } } guard let kind = kind else { return nil } - guard allowWholePatternRef || kind.value != .recurseWholePattern else { + guard allowWholePatternRef || !kind.value.recursesWholePattern else { throw ParseError.cannotReferToWholePattern } let recLevel = allowRecursionLevel ? 
try lexRecursionLevel() : nil @@ -1432,12 +1435,14 @@ extension Source { /// RecursionLevel -> '+' | '-' /// private mutating func lexRecursionLevel( - ) throws -> Located? { - try recordLoc { src in + ) throws -> AST.Atom.Number? { + let value = try recordLoc { src -> Int? in if src.tryEat("+") { return try src.expectNumber().value } - if src.tryEat("-") { return try -src.expectNumber().value } + if src.tryEat("-") { return try src.expectNumber().value.map { x in -x } } return nil } + guard let value = value else { return nil } + return .init(value.value, at: value.location) } /// Checks whether a numbered reference can be lexed. @@ -1579,9 +1584,8 @@ extension Source { } // Backslash followed by a non-0 digit character is a backreference. - if firstChar != "0", let numAndLoc = try src.lexNumber() { - return .backreference(.init( - .absolute(numAndLoc.value), innerLoc: numAndLoc.location)) + if firstChar != "0", let num = try src.lexNumber() { + return .backreference(.init(.absolute(num), innerLoc: num.location)) } return nil } @@ -1621,7 +1625,7 @@ extension Source { // Whole-pattern recursion, which is equivalent to (?0). if let loc = src.tryEatWithLoc("R") { try src.expect(")") - return .subpattern(.init(.recurseWholePattern, innerLoc: loc)) + return .subpattern(.init(.recurseWholePattern(loc), innerLoc: loc)) } // Numbered subpattern reference. @@ -1772,11 +1776,12 @@ extension Source { let arg = try recordLoc { src -> AST.Atom.Callout.PCRE.Argument in // Parse '(?C' followed by a number. if let num = try src.lexNumber() { - return .number(num.value) + return .number(num) } // '(?C)' is implicitly '(?C0)'. if src.peek() == ")" { - return .number(0) + let pos = src.currentPosition + return .number(.init(0, at: SourceLocation(pos ..< pos))) } // Parse '(C?' 
followed by a set of balanced delimiters as defined by // http://pcre.org/current/doc/html/pcre2pattern.html#SEC28 diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 52861f23d..389242614 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -96,8 +96,10 @@ struct ParsingContext { func isPriorGroupRef(_ ref: AST.Reference.Kind) -> Bool { switch ref { case .absolute(let i): + guard let i = i.value else { return false } return i <= priorGroupCount case .relative(let i): + guard let i = i.value else { return false } return i < 0 case .named(let str): return usedGroupNames.contains(str) diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index c49436702..269f0ee01 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -76,7 +76,8 @@ extension RegexValidator { throw error(.unsupported("recursion level"), at: recLevel.location) } switch ref.kind { - case .absolute(let i): + case .absolute(let num): + guard let i = num.value else { break } guard i < captures.captures.count else { throw error(.invalidReference(i), at: ref.innerLoc) } @@ -359,9 +360,9 @@ extension RegexValidator { } switch quant.amount.value { case .range(let lhs, let rhs): - guard lhs.value <= rhs.value else { - throw error( - .invalidQuantifierRange(lhs.value, rhs.value), at: quant.location) + guard let lhs = lhs.value, let rhs = rhs.value else { break } + guard lhs <= rhs else { + throw error(.invalidQuantifierRange(lhs, rhs), at: quant.location) } case .zeroOrMore, .oneOrMore, .zeroOrOne, .exactly, .nOrMore, .upToN: break diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 10e50d712..640cf5559 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ 
-173,6 +173,12 @@ extension AST.Atom { } } +extension AST.Atom.Number: _ASTPrintable { + public var _dumpBase: String { + value.map { "\($0)" } ?? "" + } +} + extension AST.Atom.Callout: _ASTPrintable { public var _dumpBase: String { switch self { @@ -227,7 +233,7 @@ extension AST.Reference: _ASTPrintable { public var _dumpBase: String { var result = "\(kind)" if let recursionLevel = recursionLevel { - result += "\(recursionLevel.value)" + result += "\(recursionLevel)" } return result } @@ -270,11 +276,11 @@ extension AST.Quantification.Amount: _ASTPrintable { case .zeroOrMore: return "zeroOrMore" case .oneOrMore: return "oneOrMore" case .zeroOrOne: return "zeroOrOne" - case let .exactly(n): return "exactly<\(n.value)>" - case let .nOrMore(n): return "nOrMore<\(n.value)>" - case let .upToN(n): return "uptoN<\(n.value)>" + case let .exactly(n): return "exactly<\(n)>" + case let .nOrMore(n): return "nOrMore<\(n)>" + case let .upToN(n): return "uptoN<\(n)>" case let .range(lower, upper): - return ".range<\(lower.value)...\(upper.value)>" + return ".range<\(lower)...\(upper)>" } } } diff --git a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift index ac553a115..be7b98991 100644 --- a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift +++ b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift @@ -217,9 +217,9 @@ extension AST.Quantification.Amount { case .zeroOrMore: return "*" case .oneOrMore: return "+" case .zeroOrOne: return "?" 
- case let .exactly(n): return "{\(n.value)}" - case let .nOrMore(n): return "{\(n.value),}" - case let .upToN(n): return "{,\(n.value)}" + case let .exactly(n): return "{\(n._canonicalBase)}" + case let .nOrMore(n): return "{\(n._canonicalBase),}" + case let .upToN(n): return "{,\(n._canonicalBase)}" case let .range(lower, upper): return "{\(lower),\(upper)}" } @@ -229,6 +229,12 @@ extension AST.Quantification.Kind { var _canonicalBase: String { self.rawValue } } +extension AST.Atom.Number { + var _canonicalBase: String { + value.map { "\($0)" } ?? "<#number#>" + } +} + extension AST.Atom { var _canonicalBase: String { if let anchor = self.assertionKind { @@ -305,9 +311,9 @@ extension AST.GlobalMatchingOption.NewlineSequenceMatching { extension AST.GlobalMatchingOption.Kind { var _canonicalBase: String { switch self { - case .limitDepth(let i): return "LIMIT_DEPTH=\(i.value)" - case .limitHeap(let i): return "LIMIT_HEAP=\(i.value)" - case .limitMatch(let i): return "LIMIT_MATCH=\(i.value)" + case .limitDepth(let i): return "LIMIT_DEPTH=\(i._canonicalBase)" + case .limitHeap(let i): return "LIMIT_HEAP=\(i._canonicalBase)" + case .limitMatch(let i): return "LIMIT_MATCH=\(i._canonicalBase)" case .notEmpty: return "NOTEMPTY" case .notEmptyAtStart: return "NOTEMPTY_ATSTART" case .noAutoPossess: return "NO_AUTO_POSSESS" diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 3cfbdcbd1..820a4c721 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -84,7 +84,10 @@ fileprivate extension Compiler.ByteCodeGen { } switch ref.kind { - case .absolute(let i): + case .absolute(let n): + guard let i = n.value else { + throw Unreachable("Expected a value") + } builder.buildBackreference(.init(i)) case .named(let name): try builder.buildNamedReference(name) @@ -459,6 +462,9 @@ fileprivate extension Compiler.ByteCodeGen { } let (low, high) = amount.bounds + guard let low = 
low else { + throw Unreachable("Must have a lower bound") + } switch (low, high) { case (_, 0): // TODO: Should error out earlier, maybe DSL and parser diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 2fe7c6ccc..049a91ce0 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -973,16 +973,22 @@ extension AST.Atom { } } +extension AST.Atom.Number { + var _patternBase: String { + value.map { "\($0)" } ?? "<#number#>" + } +} + extension AST.Quantification.Amount { var _patternBase: String { switch self { case .zeroOrMore: return "ZeroOrMore" case .oneOrMore: return "OneOrMore" case .zeroOrOne: return "Optionally" - case let .exactly(n): return "Repeat(count: \(n.value))" - case let .nOrMore(n): return "Repeat(\(n.value)...)" - case let .upToN(n): return "Repeat(...\(n.value))" - case let .range(n, m): return "Repeat(\(n.value)...\(m.value))" + case let .exactly(n): return "Repeat(count: \(n._patternBase))" + case let .nOrMore(n): return "Repeat(\(n._patternBase)...)" + case let .upToN(n): return "Repeat(...\(n._patternBase))" + case let .range(n, m): return "Repeat(\(n._patternBase)...\(m._patternBase))" } } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index c4d6d46e3..e214f92f1 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -765,16 +765,16 @@ extension DSLTree { .init(ast: .zeroOrOne) } public static func exactly(_ n: Int) -> Self { - .init(ast: .exactly(.init(faking: n))) + .init(ast: .exactly(.init(n, at: .fake))) } public static func nOrMore(_ n: Int) -> Self { - .init(ast: .nOrMore(.init(faking: n))) + .init(ast: .nOrMore(.init(n, at: .fake))) } public static func upToN(_ n: Int) -> Self { - .init(ast: .upToN(.init(faking: n))) + .init(ast: .upToN(.init(n, at: .fake))) } public static func range(_ lower: Int, _ 
upper: Int) -> Self { - .init(ast: .range(.init(faking: lower), .init(faking: upper))) + .init(ast: .range(.init(lower, at: .fake), .init(upper, at: .fake))) } } diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index 49a08430d..c3f3423ef 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -154,20 +154,39 @@ func unsetMatchingOptions( unsetMatchingOptions(adding: adding) } -func ref(_ i: Int, recursionLevel: Int? = nil) -> AST.Reference { - .init(.absolute(i), recursionLevel: recursionLevel.map { .init(faking: $0) }, - innerLoc: .fake) +func ref(_ n: Int?) -> AST.Reference.Kind { + .absolute(.init(n, at: .fake)) } -func ref(plus n: Int, recursionLevel: Int? = nil) -> AST.Reference { - .init(.relative(n), recursionLevel: recursionLevel.map { .init(faking: $0) }, - innerLoc: .fake) +func ref(plus n: Int?) -> AST.Reference.Kind { + .relative(.init(n, at: .fake)) } -func ref(minus n: Int, recursionLevel: Int? = nil) -> AST.Reference { - .init(.relative(-n), recursionLevel: recursionLevel.map { .init(faking: $0) }, - innerLoc: .fake) +func ref(minus n: Int?) -> AST.Reference.Kind { + .relative(.init(n.map { x in -x }, at: .fake)) +} +func ref(named n: String) -> AST.Reference.Kind { + .named(n) +} + +func ref(_ n: Int?, recursionLevel: Int? = nil) -> AST.Reference { + .init( + ref(n), recursionLevel: recursionLevel.map { .init($0, at: .fake) }, + innerLoc: .fake + ) +} +func ref(plus n: Int?, recursionLevel: Int? = nil) -> AST.Reference { + .init( + ref(plus: n), recursionLevel: recursionLevel.map { .init($0, at: .fake) }, + innerLoc: .fake + ) +} +func ref(minus n: Int?, recursionLevel: Int? = nil) -> AST.Reference { + .init( + ref(minus: n), recursionLevel: recursionLevel.map { .init($0, at: .fake) }, + innerLoc: .fake + ) } func ref(_ s: String, recursionLevel: Int? 
= nil) -> AST.Reference { - .init(.named(s), recursionLevel: recursionLevel.map { .init(faking: $0) }, + .init(.named(s), recursionLevel: recursionLevel.map { .init($0, at: .fake) }, innerLoc: .fake) } func conditional( @@ -179,10 +198,11 @@ func conditional( } func pcreVersionCheck( _ kind: AST.Conditional.Condition.PCREVersionCheck.Kind, - _ major: Int, _ minor: Int + _ major: Int?, _ minor: Int? ) -> AST.Conditional.Condition.Kind { .pcreVersionCheck(.init( - .init(faking: kind), .init(major: major, minor: minor, .fake) + .init(faking: kind), .init(major: .init(major, at: .fake), + minor: .init(minor, at: .fake), .fake) )) } func groupCondition( @@ -191,8 +211,11 @@ func groupCondition( .group(.init(.init(faking: kind), child, .fake)) } -func pcreCallout(_ arg: AST.Atom.Callout.PCRE.Argument) -> AST.Node { - atom(.callout(.pcre(.init(.init(faking: arg))))) +func pcreCallout(number: Int?) -> AST.Node { + atom(.callout(.pcre(.init(.init(faking: .number(.init(number, at: .fake))))))) +} +func pcreCallout(string: String) -> AST.Node { + atom(.callout(.pcre(.init(.init(faking: .string(string)))))) } func absentRepeater(_ child: AST.Node) -> AST.Node { @@ -268,34 +291,34 @@ func oneOrMore( quant(.oneOrMore, kind, child) } func exactly( - _ i: Int, + _ i: Int?, _ kind: AST.Quantification.Kind = .eager, of child: AST.Node ) -> AST.Node { - quant(.exactly(.init(faking: i)), kind, child) + quant(.exactly(.init(i, at: .fake)), kind, child) } func nOrMore( - _ i: Int, + _ i: Int?, _ kind: AST.Quantification.Kind = .eager, of child: AST.Node ) -> AST.Node { - quant(.nOrMore(.init(faking: i)), kind, child) + quant(.nOrMore(.init(i, at: .fake)), kind, child) } func upToN( - _ i: Int, + _ i: Int?, _ kind: AST.Quantification.Kind = .eager, of child: AST.Node ) -> AST.Node { - quant(.upToN(.init(faking: i)), kind, child) + quant(.upToN(.init(i, at: .fake)), kind, child) } func quantRange( _ r: ClosedRange, _ kind: AST.Quantification.Kind = .eager, of child: AST.Node ) -> AST.Node 
{ - let lower = AST.Located(faking: r.lowerBound) - let upper = AST.Located(faking: r.upperBound) - return quant(.range(lower, upper), kind, child) + quant(.range( + .init(r.lowerBound, at: .fake), .init(r.upperBound, at: .fake) + ), kind, child) } func charClass( @@ -370,7 +393,7 @@ func scalarSeq_m(_ s: Unicode.Scalar...) -> AST.CustomCharacterClass.Member { func backreference(_ r: AST.Reference.Kind, recursionLevel: Int? = nil) -> AST.Node { atom(.backreference(.init( - r, recursionLevel: recursionLevel.map { .init(faking: $0) }, innerLoc: .fake + r, recursionLevel: recursionLevel.map { .init($0, at: .fake) }, innerLoc: .fake ))) } func subpattern(_ r: AST.Reference.Kind) -> AST.Node { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 207d7e13d..6b31342eb 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1178,34 +1178,34 @@ extension RegexTests { // \1 ... \9 are always backreferences. for i in 1 ... 9 { - parseTest("\\\(i)", backreference(.absolute(i)), throwsError: .invalid) + parseTest("\\\(i)", backreference(ref(i)), throwsError: .invalid) parseTest( "()()()()()()()()()\\\(i)", concat(Array(repeating: capture(empty()), count: 9) - + [backreference(.absolute(i))]), + + [backreference(ref(i))]), captures: .caps(count: 9) ) } - parseTest(#"\10"#, backreference(.absolute(10)), throwsError: .invalid) - parseTest(#"\18"#, backreference(.absolute(18)), throwsError: .invalid) - parseTest(#"\7777"#, backreference(.absolute(7777)), throwsError: .invalid) - parseTest(#"\91"#, backreference(.absolute(91)), throwsError: .invalid) + parseTest(#"\10"#, backreference(ref(10)), throwsError: .invalid) + parseTest(#"\18"#, backreference(ref(18)), throwsError: .invalid) + parseTest(#"\7777"#, backreference(ref(7777)), throwsError: .invalid) + parseTest(#"\91"#, backreference(ref(91)), throwsError: .invalid) parseTest( #"()()()()()()()()()()\10"#, concat(Array(repeating: capture(empty()), count: 10) 
- + [backreference(.absolute(10))]), + + [backreference(ref(10))]), captures: .caps(count: 10) ) parseTest( #"()()()()()()()()()\10()"#, concat(Array(repeating: capture(empty()), count: 9) - + [backreference(.absolute(10)), capture(empty())]), + + [backreference(ref(10)), capture(empty())]), captures: .caps(count: 10) ) parseTest(#"()()\10"#, concat( - capture(empty()), capture(empty()), backreference(.absolute(10))), + capture(empty()), capture(empty()), backreference(ref(10))), throwsError: .invalid, captures: [.cap, .cap] ) @@ -1216,21 +1216,21 @@ extension RegexTests { parseTest( // There are 9 capture groups in total here. #"((()()())(()()()))\10"#, concat(capture(concat( - fourCaptures, fourCaptures)), backreference(.absolute(10))), + fourCaptures, fourCaptures)), backreference(ref(10))), throwsError: .invalid, captures: .caps(count: 9) ) parseTest( // There are 10 capture groups in total here. #"((()()())()(()()()))\10"#, concat(capture(concat(fourCaptures, capture(empty()), fourCaptures)), - backreference(.absolute(10))), + backreference(ref(10))), captures: .caps(count: 10) ) parseTest( // There are 10 capture groups in total here. 
#"((((((((((\10))))))))))"#, capture(capture(capture(capture(capture(capture(capture(capture(capture( - capture(backreference(.absolute(10)))))))))))), + capture(backreference(ref(10)))))))))))), captures: .caps(count: 10) ) @@ -1241,21 +1241,21 @@ extension RegexTests { concat(Array(repeating: capture(empty()), count: 40) + [scalar(" ")]), captures: .caps(count: 40) ) - parseTest(#"\40"#, backreference(.absolute(40)), throwsError: .invalid) + parseTest(#"\40"#, backreference(ref(40)), throwsError: .invalid) parseTest( String(repeating: "()", count: 40) + #"\40"#, concat(Array(repeating: capture(empty()), count: 40) - + [backreference(.absolute(40))]), + + [backreference(ref(40))]), captures: .caps(count: 40) ) - parseTest(#"\7"#, backreference(.absolute(7)), throwsError: .invalid) + parseTest(#"\7"#, backreference(ref(7)), throwsError: .invalid) - parseTest(#"\11"#, backreference(.absolute(11)), throwsError: .invalid) + parseTest(#"\11"#, backreference(ref(11)), throwsError: .invalid) parseTest( String(repeating: "()", count: 12) + #"\11"#, concat(Array(repeating: capture(empty()), count: 12) - + [backreference(.absolute(11))]), + + [backreference(ref(11))]), captures: .caps(count: 12) ) parseTest(#"\011"#, scalar("\u{9}")) @@ -1266,25 +1266,25 @@ extension RegexTests { ) parseTest(#"\0113"#, scalar("\u{4B}")) - parseTest(#"\113"#, backreference(.absolute(113)), throwsError: .invalid) - parseTest(#"\377"#, backreference(.absolute(377)), throwsError: .invalid) - parseTest(#"\81"#, backreference(.absolute(81)), throwsError: .invalid) - - parseTest(#"\g1"#, backreference(.absolute(1)), throwsError: .invalid) - parseTest(#"\g001"#, backreference(.absolute(1)), throwsError: .invalid) - parseTest(#"\g52"#, backreference(.absolute(52)), throwsError: .invalid) - parseTest(#"\g-01"#, backreference(.relative(-1)), throwsError: .unsupported) - parseTest(#"\g+30"#, backreference(.relative(30)), throwsError: .unsupported) - - parseTest(#"\g{1}"#, backreference(.absolute(1)), 
throwsError: .invalid) - parseTest(#"\g{001}"#, backreference(.absolute(1)), throwsError: .invalid) - parseTest(#"\g{52}"#, backreference(.absolute(52)), throwsError: .invalid) - parseTest(#"\g{-01}"#, backreference(.relative(-1)), throwsError: .unsupported) - parseTest(#"\g{+30}"#, backreference(.relative(30)), throwsError: .unsupported) - parseTest(#"\k<+4>"#, backreference(.relative(4)), throwsError: .unsupported) - parseTest(#"\k<2>"#, backreference(.absolute(2)), throwsError: .invalid) - parseTest(#"\k'-3'"#, backreference(.relative(-3)), throwsError: .unsupported) - parseTest(#"\k'1'"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\113"#, backreference(ref(113)), throwsError: .invalid) + parseTest(#"\377"#, backreference(ref(377)), throwsError: .invalid) + parseTest(#"\81"#, backreference(ref(81)), throwsError: .invalid) + + parseTest(#"\g1"#, backreference(ref(1)), throwsError: .invalid) + parseTest(#"\g001"#, backreference(ref(1)), throwsError: .invalid) + parseTest(#"\g52"#, backreference(ref(52)), throwsError: .invalid) + parseTest(#"\g-01"#, backreference(ref(minus: 1)), throwsError: .unsupported) + parseTest(#"\g+30"#, backreference(ref(plus: 30)), throwsError: .unsupported) + + parseTest(#"\g{1}"#, backreference(ref(1)), throwsError: .invalid) + parseTest(#"\g{001}"#, backreference(ref(1)), throwsError: .invalid) + parseTest(#"\g{52}"#, backreference(ref(52)), throwsError: .invalid) + parseTest(#"\g{-01}"#, backreference(ref(minus: 1)), throwsError: .unsupported) + parseTest(#"\g{+30}"#, backreference(ref(plus: 30)), throwsError: .unsupported) + parseTest(#"\k<+4>"#, backreference(ref(plus: 4)), throwsError: .unsupported) + parseTest(#"\k<2>"#, backreference(ref(2)), throwsError: .invalid) + parseTest(#"\k'-3'"#, backreference(ref(minus: 3)), throwsError: .unsupported) + parseTest(#"\k'1'"#, backreference(ref(1)), throwsError: .invalid) parseTest( #"(?)\k"#, concat( @@ -1315,18 +1315,18 @@ extension RegexTests { // Oniguruma 
recursion levels. parseTest(#"\k"#, backreference(.named("bc"), recursionLevel: 0), throwsError: .unsupported) parseTest(#"\k"#, backreference(.named("a"), recursionLevel: 0), throwsError: .unsupported) - parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1), throwsError: .unsupported) - parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8), throwsError: .unsupported) - parseTest(#"\k'-3-8'"#, backreference(.relative(-3), recursionLevel: -8), throwsError: .unsupported) + parseTest(#"\k<1+1>"#, backreference(ref(1), recursionLevel: 1), throwsError: .unsupported) + parseTest(#"\k<3-8>"#, backreference(ref(3), recursionLevel: -8), throwsError: .unsupported) + parseTest(#"\k'-3-8'"#, backreference(ref(minus: 3), recursionLevel: -8), throwsError: .unsupported) parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8), throwsError: .unsupported) - parseTest(#"\k'+3-8'"#, backreference(.relative(3), recursionLevel: -8), throwsError: .unsupported) - parseTest(#"\k'+3+8'"#, backreference(.relative(3), recursionLevel: 8), throwsError: .unsupported) - - parseTest(#"(?R)"#, subpattern(.recurseWholePattern), throwsError: .unsupported) - parseTest(#"(?0)"#, subpattern(.recurseWholePattern), throwsError: .unsupported) - parseTest(#"(?1)"#, subpattern(.absolute(1)), throwsError: .unsupported) - parseTest(#"(?+12)"#, subpattern(.relative(12)), throwsError: .unsupported) - parseTest(#"(?-2)"#, subpattern(.relative(-2)), throwsError: .unsupported) + parseTest(#"\k'+3-8'"#, backreference(ref(plus: 3), recursionLevel: -8), throwsError: .unsupported) + parseTest(#"\k'+3+8'"#, backreference(ref(plus: 3), recursionLevel: 8), throwsError: .unsupported) + + parseTest(#"(?R)"#, subpattern(ref(0)), throwsError: .unsupported) + parseTest(#"(?0)"#, subpattern(ref(0)), throwsError: .unsupported) + parseTest(#"(?1)"#, subpattern(ref(1)), throwsError: .unsupported) + parseTest(#"(?+12)"#, subpattern(ref(plus: 12)), throwsError: .unsupported) + 
parseTest(#"(?-2)"#, subpattern(ref(minus: 2)), throwsError: .unsupported) parseTest(#"(?&hello)"#, subpattern(.named("hello")), throwsError: .unsupported) parseTest(#"(?P>P)"#, subpattern(.named("P")), throwsError: .unsupported) @@ -1334,25 +1334,25 @@ extension RegexTests { parseTest(#"[(?&a)]"#, charClass("(", "?", "&", "a", ")")) parseTest(#"[(?1)]"#, charClass("(", "?", "1", ")")) - parseTest(#"\g<1>"#, subpattern(.absolute(1)), throwsError: .unsupported) - parseTest(#"\g<001>"#, subpattern(.absolute(1)), throwsError: .unsupported) - parseTest(#"\g'52'"#, subpattern(.absolute(52)), throwsError: .unsupported) - parseTest(#"\g'-01'"#, subpattern(.relative(-1)), throwsError: .unsupported) - parseTest(#"\g'+30'"#, subpattern(.relative(30)), throwsError: .unsupported) + parseTest(#"\g<1>"#, subpattern(ref(1)), throwsError: .unsupported) + parseTest(#"\g<001>"#, subpattern(ref(1)), throwsError: .unsupported) + parseTest(#"\g'52'"#, subpattern(ref(52)), throwsError: .unsupported) + parseTest(#"\g'-01'"#, subpattern(ref(minus: 1)), throwsError: .unsupported) + parseTest(#"\g'+30'"#, subpattern(ref(plus: 30)), throwsError: .unsupported) parseTest(#"\g'abc'"#, subpattern(.named("abc")), throwsError: .unsupported) // These are valid references. parseTest(#"()\1"#, concat( - capture(empty()), backreference(.absolute(1)) + capture(empty()), backreference(ref(1)) ), captures: [.cap]) parseTest(#"\1()"#, concat( - backreference(.absolute(1)), capture(empty()) + backreference(ref(1)), capture(empty()) ), captures: [.cap]) parseTest(#"()()\2"#, concat( - capture(empty()), capture(empty()), backreference(.absolute(2)) + capture(empty()), capture(empty()), backreference(ref(2)) ), captures: [.cap, .cap]) parseTest(#"()\2()"#, concat( - capture(empty()), backreference(.absolute(2)), capture(empty()) + capture(empty()), backreference(ref(2)), capture(empty()) ), captures: [.cap, .cap]) // MARK: Character names. 
@@ -1652,13 +1652,13 @@ extension RegexTests { // PCRE callouts - parseTest(#"(?C)"#, pcreCallout(.number(0)), throwsError: .unsupported) - parseTest(#"(?C0)"#, pcreCallout(.number(0)), throwsError: .unsupported) - parseTest(#"(?C20)"#, pcreCallout(.number(20)), throwsError: .unsupported) - parseTest("(?C{abc})", pcreCallout(.string("abc")), throwsError: .unsupported) + parseTest(#"(?C)"#, pcreCallout(number: 0), throwsError: .unsupported) + parseTest(#"(?C0)"#, pcreCallout(number: 0), throwsError: .unsupported) + parseTest(#"(?C20)"#, pcreCallout(number: 20), throwsError: .unsupported) + parseTest("(?C{abc})", pcreCallout(string: "abc"), throwsError: .unsupported) for delim in ["`", "'", "\"", "^", "%", "#", "$"] { - parseTest("(?C\(delim)hello\(delim))", pcreCallout(.string("hello")), + parseTest("(?C\(delim)hello\(delim))", pcreCallout(string: "hello"), throwsError: .unsupported) } @@ -1737,7 +1737,7 @@ extension RegexTests { parseTest("(*CR)(*UTF)(*LIMIT_DEPTH=3)", ast( empty(), opts: .newlineMatching(.carriageReturnOnly), .utfMode, - .limitDepth(.init(faking: 3)) + .limitDepth(.init(3, at: .fake)) ), throwsError: .unsupported) parseTest( @@ -1763,8 +1763,8 @@ extension RegexTests { (*NO_START_OPT)(*UTF)(*UCP)a """, ast("a", opts: - .limitDepth(.init(faking: 3)), .limitHeap(.init(faking: 1)), - .limitMatch(.init(faking: 2)), .notEmpty, .notEmptyAtStart, + .limitDepth(.init(3, at: .fake)), .limitHeap(.init(1, at: .fake)), + .limitMatch(.init(2, at: .fake)), .notEmpty, .notEmptyAtStart, .noAutoPossess, .noDotStarAnchor, .noJIT, .noStartOpt, .utfMode, .unicodeProperties ), throwsError: .unsupported @@ -2323,17 +2323,17 @@ extension RegexTests { parseWithDelimitersTest( #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .invalid) parseWithDelimitersTest( - #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1), + #"re'\k'+2-1''"#, backreference(ref(plus: 2), recursionLevel: -1), throwsError: .unsupported ) parseWithDelimitersTest( 
#"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))), throwsError: .unsupported) parseWithDelimitersTest( - #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'"), throwsError: .unsupported) + #"re'\g'-1'\''"#, concat(subpattern(ref(minus: 1)), "'"), throwsError: .unsupported) parseWithDelimitersTest( - #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(.string(#"a*b\c 🔥_ ;"#)), + #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(string: #"a*b\c 🔥_ ;"#), throwsError: .unsupported) // Fine, because we don't end up skipping. @@ -2369,6 +2369,8 @@ extension RegexTests { parseNotEqualTest(#"abc"#, #"abd"#) parseNotEqualTest(#" "#, #""#) + parseNotEqualTest(#"a{2}"#, #"a{3}"#) + parseNotEqualTest(#"[\p{Any}]"#, #"[[:Any:]]"#) parseNotEqualTest(#"\u{A}"#, #"\u{B}"#) From 2ec86db82d17baa991cdcbfcc00fe3d01b27f149 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 29 Jun 2022 11:15:01 +0100 Subject: [PATCH 3/6] Introduce Diagnostics --- Sources/_RegexParser/Regex/AST/AST.swift | 17 ++- .../Regex/Parse/Diagnostics.swift | 123 +++++++++++++++++- Sources/_RegexParser/Regex/Parse/Parse.swift | 3 +- .../Regex/Printing/PrintAsCanonical.swift | 2 +- .../_StringProcessing/PrintAsPattern.swift | 2 +- Sources/_StringProcessing/Regex/Core.swift | 3 +- Sources/_StringProcessing/Regex/DSLTree.swift | 10 -- .../Utility/ASTBuilder.swift | 2 +- Tests/RegexTests/ParseTests.swift | 5 +- 9 files changed, 147 insertions(+), 20 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/AST.swift b/Sources/_RegexParser/Regex/AST/AST.swift index 97051db10..43bb460c3 100644 --- a/Sources/_RegexParser/Regex/AST/AST.swift +++ b/Sources/_RegexParser/Regex/AST/AST.swift @@ -15,16 +15,31 @@ public struct AST: Hashable { public var root: AST.Node public var globalOptions: GlobalMatchingOptionSequence? + public var diags: Diagnostics - public init(_ root: AST.Node, globalOptions: GlobalMatchingOptionSequence?) 
{ + public init( + _ root: AST.Node, globalOptions: GlobalMatchingOptionSequence?, + diags: Diagnostics + ) { self.root = root self.globalOptions = globalOptions + self.diags = diags } } extension AST { /// Whether this AST tree contains at least one capture nested inside of it. public var hasCapture: Bool { root.hasCapture } + + /// Whether this AST tree is either syntactically or semantically invalid. + public var isInvalid: Bool { diags.hasAnyError } + + /// If the AST is invalid, throws an error. Otherwise, returns self. + @discardableResult + public func ensureValid() throws -> AST { + try diags.throwAnyError() + return self + } } extension AST { diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index abb4afe56..ad571b5fd 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -242,9 +242,128 @@ extension ParseError: CustomStringConvertible { } } -// TODO: Fixits, notes, etc. +/// A fatal error that indicates broken logic in the parser. +enum FatalParseError: Hashable, Error { + case unreachable(String) +} + +extension FatalParseError: CustomStringConvertible { + var description: String { + switch self { + case .unreachable(let str): + return "UNREACHABLE: \(str)" + } + } +} + +// MARK: Diagnostic handling + +/// A diagnostic to emit. +public struct Diagnostic: Hashable { + public let behavior: Behavior + public let message: String + public let location: SourceLocation + + // TODO: Fixits, notes, etc. + + // The underlying ParseError if applicable. This is used for testing. + internal let underlyingParseError: ParseError? + + init(_ behavior: Behavior, _ message: String, at loc: SourceLocation, + underlyingParseError: ParseError? 
= nil) { + self.behavior = behavior + self.message = message + self.location = loc + self.underlyingParseError = underlyingParseError + } + + public var isAnyError: Bool { behavior.isAnyError } +} + +extension Diagnostic { + public enum Behavior: Hashable { + case fatalError, error, warning + + public var isAnyError: Bool { + switch self { + case .fatalError, .error: + return true + case .warning: + return false + } + } + } +} -// TODO: Diagnostics engine, recorder, logger, or similar. +/// A collection of diagnostics to emit. +public struct Diagnostics: Hashable { + public private(set) var diags = [Diagnostic]() + public init() {} + public init(_ diags: [Diagnostic]) { + self.diags = diags + } + /// Add a new diagnostic to emit. + public mutating func append(_ diag: Diagnostic) { + diags.append(diag) + } + /// Add all the diagnostics of another diagnostic collection. + public mutating func append(contentsOf other: Diagnostics) { + diags.append(contentsOf: other.diags) + } + + /// Add all the new fatal error diagnostics of another diagnostic collection. + /// This assumes that `other` was the same as `self`, but may have additional + /// diagnostics added to it. + public mutating func appendNewFatalErrors(from other: Diagnostics) { + let newDiags = other.diags.dropFirst(diags.count) + for diag in newDiags where diag.behavior == .fatalError { + append(diag) + } + } + + /// Whether any error is present. This includes fatal errors. + public var hasAnyError: Bool { + diags.contains(where: { $0.isAnyError }) + } + + /// Whether any fatal error is present. + public var hasFatalError: Bool { + diags.contains(where: { $0.behavior == .fatalError }) + } + + /// If any error diagnostic has been added, throw it as an Error. 
+ func throwAnyError() throws { + for diag in diags where diag.isAnyError { + struct ErrorDiagnostic: Error, CustomStringConvertible { + var diag: Diagnostic + var description: String { diag.message } + } + throw ErrorDiagnostic(diag: diag) + } + } +} + +// MARK: Diagnostic construction + +extension Diagnostic { + init(_ err: ParseError, at loc: SourceLocation) { + self.init(.error, "\(err)", at: loc, underlyingParseError: err) + } + + init(_ err: FatalParseError, at loc: SourceLocation) { + self.init(.fatalError, "\(err)", at: loc) + } +} + +extension Diagnostics { + mutating func error(_ err: ParseError, at loc: SourceLocation) { + append(Diagnostic(err, at: loc)) + } + + mutating func fatal(_ err: FatalParseError, at loc: SourceLocation) { + append(Diagnostic(err, at: loc)) + } +} diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 389242614..05a618f59 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -165,7 +165,8 @@ extension Parser { } fatalError("Unhandled termination condition") } - return .init(ast, globalOptions: opts) + // TODO: Record and store diagnostics on the AST. + return .init(ast, globalOptions: opts, diags: Diagnostics()) } /// Parse a regular expression node. 
This should be used instead of `parse()` diff --git a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift index be7b98991..0e7cfb1d3 100644 --- a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift +++ b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift @@ -32,7 +32,7 @@ extension AST.Node { showDelimiters delimiters: Bool = false, terminateLine: Bool = false ) -> String { - AST(self, globalOptions: nil).renderAsCanonical( + AST(self, globalOptions: nil, diags: Diagnostics()).renderAsCanonical( showDelimiters: delimiters, terminateLine: terminateLine) } } diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 049a91ce0..a0cc11d01 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -56,7 +56,7 @@ extension PrettyPrinter { mutating func printBackoff(_ node: DSLTree.Node) { precondition(node.astNode != nil, "unconverted node") printAsCanonical( - .init(node.astNode!, globalOptions: nil), + .init(node.astNode!, globalOptions: nil, diags: Diagnostics()), delimiters: true) } diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 98ba1a4f3..b27095f3f 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -37,7 +37,8 @@ public struct Regex: RegexComponent { self.program = Program(ast: ast) } init(ast: AST.Node) { - self.program = Program(ast: .init(ast, globalOptions: nil)) + self.program = Program(ast: + .init(ast, globalOptions: nil, diags: Diagnostics())) } // Compiler interface. Do not change independently. 
diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index e214f92f1..196927803 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -381,16 +381,6 @@ extension DSLTree { } } -extension DSLTree { - var ast: AST? { - guard let root = root.astNode else { - return nil - } - // TODO: Options mapping - return AST(root, globalOptions: nil) - } -} - extension DSLTree { var hasCapture: Bool { root.hasCapture diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index c3f3423ef..49f9e9b11 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -48,7 +48,7 @@ func empty() -> AST.Node { } func ast(_ root: AST.Node, opts: [AST.GlobalMatchingOption.Kind]) -> AST { - .init(root, globalOptions: .init(opts.map { .init($0, .fake) })) + .init(root, globalOptions: .init(opts.map { .init($0, .fake) }), diags: Diagnostics()) } func ast(_ root: AST.Node, opts: AST.GlobalMatchingOption.Kind...) -> AST { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 6b31342eb..608d55978 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -48,8 +48,9 @@ func parseTest( line: UInt = #line ) { parseTest( - input, .init(expectedAST, globalOptions: nil), throwsError: errorKind, - syntax: syntax, captures: expectedCaptures, file: file, line: line + input, .init(expectedAST, globalOptions: nil, diags: Diagnostics()), + throwsError: errorKind, syntax: syntax, captures: expectedCaptures, + file: file, line: line ) } From 77a889e3b658e0f38d036f3e6a6b779ba3a8163a Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 29 Jun 2022 11:15:02 +0100 Subject: [PATCH 4/6] Recover from parser errors Currently we use Swift error handling for parser errors. 
While this is convenient, it has a number of drawbacks: - Any AST parsed gets thrown away as soon as we encounter an error. This prevents clients from being able to get any useful information from invalid AST (rdar://93677069). - Multiple diagnostics cannot be issued, meaning that e.g. a basic syntactic error could obscure a more useful semantic error. - It doesn't extend nicely to e.g. warning diagnostics, meaning that we'd eventually end up with 2 ways of emitting diagnostics. - The thrown errors relied on `recordLoc` blocks to annotate them with source location info, which could lead to errors without location info if we forgot to add the appropriate `recordLoc` calls. Additionally, in some cases we want more fine-grained location info than the block would give us. Therefore this commit removes the use of Swift error handling throughout the parser. The parser is now a total function that _always_ returns an AST. If errors are encountered while parsing, they are recorded, and are attached to the resulting AST by the parser. The parser attempts to recover as much of the AST as it can when encountering an error. As such, there are now `.invalid` atom and character property kinds. Sema then runs and can attach more diagnostics onto the AST. For now, the compiler interface remains the same, and we pick a single error to `throw`, but this will be changed in a later PR to allow multiple errors and warnings, as well as AST recovery. This also means we can better preserve the capture type in the presence of parser errors. Fortunately, in most cases, this is quite a mechanical transformation. It entails: - Moving the lexical analysis methods onto the `Parser`. We were already passing `ParsingContext` parameters for most of them, so it's not clear they were benefitting from the isolation that `Source` offered. Effectively this means that all parsing has access to the context and diagnostics. 
- Converting error throwing statements into calls to the parser's `error` method (or `unreachable` method for unreachables). This commit also updates the parser tests to be able to match against multiple diagnostics. --- Sources/_RegexParser/Regex/AST/Atom.swift | 15 +- .../CharacterPropertyClassification.swift | 103 +- .../Regex/Parse/Diagnostics.swift | 2 +- .../Regex/Parse/LexicalAnalysis.swift | 1799 +++++++++-------- Sources/_RegexParser/Regex/Parse/Parse.swift | 296 +-- Sources/_RegexParser/Regex/Parse/Sema.swift | 257 ++- Sources/_RegexParser/Regex/Parse/Source.swift | 95 +- .../Regex/Parse/SourceLocation.swift | 10 + .../_RegexParser/Regex/Printing/DumpAST.swift | 3 + .../_StringProcessing/ConsumerInterface.swift | 5 +- .../_StringProcessing/PrintAsPattern.swift | 4 + Tests/RegexTests/LexTests.swift | 52 +- Tests/RegexTests/ParseTests.swift | 1041 ++++++---- 13 files changed, 1966 insertions(+), 1716 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 2d7ed61cc..f1419ad78 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -80,6 +80,9 @@ extension AST { // (?i), (?i-m), ... case changeMatchingOptions(MatchingOptionSequence) + + // An invalid atom created by a parse error. + case invalid } } } @@ -104,6 +107,7 @@ extension AST.Atom { case .any: return nil case .startOfLine: return nil case .endOfLine: return nil + case .invalid: return nil } } @@ -465,6 +469,9 @@ extension AST.Atom.CharacterProperty { /// Some special properties implemented by Java. case javaSpecial(JavaSpecial) + /// An invalid property that has been diagnosed by the parser. 
+ case invalid(key: String?, value: String) + public enum MapKind: Hashable { case lowercase case uppercase @@ -801,7 +808,7 @@ extension AST.Atom { case .scalarSequence, .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern, .callout, .backtrackingDirective, - .changeMatchingOptions: + .changeMatchingOptions, .invalid: return nil } } @@ -815,6 +822,10 @@ extension AST.Atom { // \cx, \C-x, \M-x, \M-\C-x, \N{...} case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter: return true + case .scalarSequence: + // Unsupported for now (and we will diagnose as such), but treat it as a + // valid range operand for better recovery. + return true default: return false } @@ -849,7 +860,7 @@ extension AST.Atom { case .property, .escaped, .any, .startOfLine, .endOfLine, .backreference, .subpattern, .namedCharacter, .callout, - .backtrackingDirective, .changeMatchingOptions: + .backtrackingDirective, .changeMatchingOptions, .invalid: return nil } } diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index fb122e027..bd635c83f 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -9,12 +9,12 @@ // //===----------------------------------------------------------------------===// -extension Source { +extension Parser { typealias PropertyKind = AST.Atom.CharacterProperty.Kind static private func withNormalizedForms( - _ str: String, requireInPrefix: Bool = false, match: (String) throws -> T? - ) rethrows -> T? { + _ str: String, requireInPrefix: Bool = false, match: (String) -> T? + ) -> T? { // This follows the rules provided by UAX44-LM3, including trying to drop an // "is" prefix, which isn't required by UTS#18 RL1.2, but is nice for // consistency with other engines and the Unicode.Scalar.Properties names. 
@@ -22,12 +22,12 @@ extension Source { .lowercased() if requireInPrefix { guard str.hasPrefix("in") else { return nil } - return try match(String(str.dropFirst(2))) + return match(String(str.dropFirst(2))) } - if let m = try match(str) { + if let m = match(str) { return m } - if str.hasPrefix("is"), let m = try match(String(str.dropFirst(2))) { + if str.hasPrefix("is"), let m = match(String(str.dropFirst(2))) { return m } return nil @@ -736,31 +736,40 @@ extension Source { return (major, minor) } - static func classifyCharacterPropertyValueOnly( - _ value: String - ) throws -> PropertyKind { - guard !value.isEmpty else { throw ParseError.emptyProperty } + mutating func classifyCharacterPropertyValueOnly( + _ valueLoc: Located + ) -> PropertyKind { + let value = valueLoc.value + + func error(_ err: ParseError) -> PropertyKind { + self.error(err, at: valueLoc.location) + return .invalid(key: nil, value: value) + } + + guard !value.isEmpty else { + return error(.emptyProperty) + } // Some special cases defined by UTS#18 (and Oniguruma for 'ANY' and // 'Assigned'). - if let specialProp = classifySpecialPropValue(value) { + if let specialProp = Self.classifySpecialPropValue(value) { return specialProp } // The following properties we can infer keys/values for. 
- if let prop = classifyBoolProperty(value) { + if let prop = Self.classifyBoolProperty(value) { return .binary(prop, value: true) } - if let cat = classifyGeneralCategory(value) { + if let cat = Self.classifyGeneralCategory(value) { return .generalCategory(cat) } - if let script = classifyScriptProperty(value) { + if let script = Self.classifyScriptProperty(value) { return .scriptExtension(script) } - if let posix = classifyPOSIX(value) { + if let posix = Self.classifyPOSIX(value) { return .posix(posix) } - if let block = classifyBlockProperty(value, valueOnly: true) { + if let block = Self.classifyBlockProperty(value, valueOnly: true) { return .block(block) } @@ -776,53 +785,67 @@ extension Source { // TODO: This should be versioned, and do we want a more lax behavior for // the runtime? - throw ParseError.unknownProperty(key: nil, value: value) + return error(.unknownProperty(key: nil, value: value)) } - static func classifyCharacterProperty( - key: String, value: String - ) throws -> PropertyKind { - guard !key.isEmpty && !value.isEmpty else { throw ParseError.emptyProperty } + mutating func classifyCharacterProperty( + key keyLoc: Located, value valueLoc: Located + ) -> PropertyKind { + let key = keyLoc.value + let value = valueLoc.value + + func valueError(_ err: ParseError) -> PropertyKind { + error(err, at: valueLoc.location) + return .invalid(key: key, value: value) + } + + guard !key.isEmpty else { + error(.emptyProperty, at: keyLoc.location) + return .invalid(key: key, value: value) + } + guard !value.isEmpty else { + return valueError(.emptyProperty) + } - if let prop = classifyBoolProperty(key), - let isTrue = classifyCharacterPropertyBoolValue(value) { + if let prop = Self.classifyBoolProperty(key), + let isTrue = Self.classifyCharacterPropertyBoolValue(value) { return .binary(prop, value: isTrue) } // This uses the aliases defined in // https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt. 
- let match = try withNormalizedForms(key) { normalizedKey -> PropertyKind? in + let match = Self.withNormalizedForms(key) { normalizedKey -> PropertyKind? in switch normalizedKey { case "script", "sc": - guard let script = classifyScriptProperty(value) else { - throw ParseError.unrecognizedScript(value) + guard let script = Self.classifyScriptProperty(value) else { + return valueError(.unrecognizedScript(value)) } return .script(script) case "scriptextensions", "scx": - guard let script = classifyScriptProperty(value) else { - throw ParseError.unrecognizedScript(value) + guard let script = Self.classifyScriptProperty(value) else { + return valueError(.unrecognizedScript(value)) } return .scriptExtension(script) case "gc", "generalcategory": - guard let cat = classifyGeneralCategory(value) else { - throw ParseError.unrecognizedCategory(value) + guard let cat = Self.classifyGeneralCategory(value) else { + return valueError(.unrecognizedCategory(value)) } return .generalCategory(cat) case "age": - guard let (major, minor) = parseAge(value) else { - throw ParseError.invalidAge(value) + guard let (major, minor) = Self.parseAge(value) else { + return valueError(.invalidAge(value)) } return .age(major: major, minor: minor) case "name", "na": return .named(value) case "numericvalue", "nv": guard let numericValue = Double(value) else { - throw ParseError.invalidNumericValue(value) + return valueError(.invalidNumericValue(value)) } return .numericValue(numericValue) case "numerictype", "nt": - guard let type = classifyNumericType(value) else { - throw ParseError.unrecognizedNumericType(value) + guard let type = Self.classifyNumericType(value) else { + return valueError(.unrecognizedNumericType(value)) } return .numericType(type) case "slc", "simplelowercasemapping": @@ -833,13 +856,13 @@ extension Source { return .mapping(.titlecase, value) case "ccc", "canonicalcombiningclass": guard let cccValue = UInt8(value), cccValue <= 254 else { - throw ParseError.invalidCCC(value) + 
return valueError(.invalidCCC(value)) } return .ccc(.init(rawValue: cccValue)) case "blk", "block": - guard let block = classifyBlockProperty(value, valueOnly: false) else { - throw ParseError.unrecognizedBlock(value) + guard let block = Self.classifyBlockProperty(value, valueOnly: false) else { + return valueError(.unrecognizedBlock(value)) } return .block(block) default: @@ -852,6 +875,8 @@ extension Source { } // TODO: This should be versioned, and do we want a more lax behavior for // the runtime? - throw ParseError.unknownProperty(key: key, value: value) + error(.unknownProperty(key: key, value: value), + at: keyLoc.location.union(with: valueLoc.location)) + return .invalid(key: key, value: value) } } diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index ad571b5fd..a23e0aed1 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -341,7 +341,7 @@ public struct Diagnostics: Hashable { var diag: Diagnostic var description: String { diag.message } } - throw ErrorDiagnostic(diag: diag) + throw Source.LocatedError(ErrorDiagnostic(diag: diag), diag.location) } } } diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 691d8fa4a..05f066ff6 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -16,178 +16,244 @@ concerns upon request. 
API convention: -- lexFoo will try to consume a foo and return it if successful, throws errors -- expectFoo will consume a foo, throwing errors, and throw an error if it can't -- eat() and tryEat() is still used by the parser as a character-by-character interface +- lexFoo will try to consume a foo and return it if successful, otherwise returns nil +- expectFoo will consume a foo, diagnosing an error if unsuccessful */ -extension Error { - func addingLocation(_ loc: Range) -> Error { - // If we're already a LocatedError, don't change the location. - if self is LocatedErrorProtocol { - return self - } - return Source.LocatedError(self, loc) - } -} +extension Parser { + typealias Located = Source.Located + typealias Location = Source.Location + typealias LocatedError = Source.LocatedError + typealias Char = Source.Char -extension Source { // MARK: - recordLoc - /// Record source loc before processing and return - /// or throw the value/error with source locations. + /// Attach a source location to the parsed contents of a given function. fileprivate mutating func recordLoc( - _ f: (inout Self) throws -> T - ) rethrows -> Located { - let start = currentPosition - do { - let result = try f(&self) - return Located(result, Location(start.. { - throw e - } catch let e as ParseError { - throw LocatedError(e, Location(start.. T + ) -> Located { + let start = src.currentPosition + let result = f(&self) + return Located(result, loc(start)) + } + + /// Attach a source location to the parsed contents of a given function. fileprivate mutating func recordLoc( - _ f: (inout Self) throws -> T? - ) rethrows -> Located? { - let start = currentPosition - do { - guard let result = try f(&self) else { return nil } - return Located(result, start.. T? + ) -> Located? 
{ + let start = src.currentPosition + guard let result = f(&self) else { return nil } + return Located(result, loc(start)) } - /// Record source loc before processing and return - /// or throw the value/error with source locations. + /// Attach a source location to the parsed contents of a given function. @discardableResult fileprivate mutating func recordLoc( - _ f: (inout Self) throws -> () - ) rethrows -> SourceLocation { - let start = currentPosition - do { - try f(&self) - return SourceLocation(start.. { - throw e - } catch let e as ParseError { - throw LocatedError(e, start.. () + ) -> SourceLocation { + let start = src.currentPosition + f(&self) + return loc(start) + } +} + +// MARK: Backtracking routines + +extension Parser { + /// Attempt to make a series of lexing steps in `body`, returning `nil` if + /// unsuccesful, which will revert the parser back to its previous state. + mutating func tryEating( + _ body: (inout Self) -> T? + ) -> T? { + var current = self + guard let result = body(&self) else { + // Fatal errors are always preserved. + current.diags.appendNewFatalErrors(from: diags) + self = current + return nil } + return result + } + + /// Perform a lookahead using a temporary source. Within the body of the + /// lookahead, any modifications to the source will not be reflected outside + /// the body. + mutating func lookahead(_ body: (inout Self) -> T) -> T { + var p = self + let result = body(&p) + // Fatal errors are always preserved. + diags.appendNewFatalErrors(from: p.diags) + return result } } // MARK: - Consumption routines -extension Source { +extension Parser { typealias Quant = AST.Quantification - /// Throws an expected character error if not matched + /// Expect to eat a given character, diagnosing an error and returning + /// `false` if unsuccessful, `true` otherwise. 
@discardableResult - mutating func expect(_ c: Character) throws -> SourceLocation { - try recordLoc { src in - guard src.tryEat(c) else { - throw ParseError.expected(String(c)) - } + mutating func expect(_ c: Character) -> Bool { + guard tryEat(c) else { + errorAtCurrentPosition(.expected(String(c))) + return false + } + return true + } + + /// Same as `expect`, but with a source location. + mutating func expectWithLoc(_ c: Character) -> Located { + recordLoc { + $0.expect(c) } } - /// Throws an expected character error if not matched + /// Expect to eat a sequence of characters, diagnosing an error and returning + /// `false` if unsuccessful, `true` otherwise. + @discardableResult mutating func expect( sequence c: C - ) throws where C.Element == Character { - _ = try recordLoc { src in - guard src.tryEat(sequence: c) else { - throw ParseError.expected(String(c)) - } + ) -> Bool where C.Element == Character { + guard tryEat(sequence: c) else { + errorAtCurrentPosition(.expected(String(c))) + return false } + return true } - /// Throws an unexpected end of input error if not matched - /// - /// Note: much of the time, but not always, we can vend a more specific error. + /// Diagnoses an error and returns `false` if the end of input has been + /// reached. Otherwise returns `true`. + @discardableResult mutating func expectNonEmpty( _ error: ParseError = .unexpectedEndOfInput - ) throws { - _ = try recordLoc { src in - if src.isEmpty { throw error } + ) -> Bool { + guard !src.isEmpty else { + errorAtCurrentPosition(error) + return false } + return true } - mutating func tryEatNonEmpty(sequence c: C) throws -> Bool - where C.Element == Char - { - try expectNonEmpty(.expected(String(c))) - return tryEat(sequence: c) + /// Attempt to eat a sequence of characters, additionally diagnosing if the + /// end of the source has been reached. 
+ mutating func tryEatNonEmpty( + sequence c: C + ) -> Bool where C.Element == Char { + expectNonEmpty(.expected(String(c))) && tryEat(sequence: c) } - mutating func tryEatNonEmpty(_ c: Char) throws -> Bool { - try tryEatNonEmpty(sequence: String(c)) + /// Returns the next character, or `nil` if the end of the source has been + /// reached. + func peek() -> Char? { src.peek() } + + /// Same as `peek()`, but with the source location of the next character. + func peekWithLoc() -> Located? { + peek().map { c in + let nextPos = src.input.index(after: src.currentPosition) + return Located(c, Location(src.currentPosition ..< nextPos)) + } } - /// Attempt to make a series of lexing steps in `body`, returning `nil` if - /// unsuccesful, which will revert the source back to its previous state. If - /// an error is thrown, the source will not be reverted. - mutating func tryEating( - _ body: (inout Source) throws -> T? - ) rethrows -> T? { - // We don't revert the source if an error is thrown, as it's useful to - // maintain the source location in that case. - let current = self - guard let result = try body(&self) else { - self = current - return nil + /// Advance the input `n` characters ahead. + mutating func advance(_ n: Int = 1) { + guard src.tryAdvance(n) else { + unreachable("Advancing beyond end!") + + // Empty out the remaining characters. + src.tryAdvance(src._slice.count) + return } - return result } - /// Perform a lookahead using a temporary source. Within the body of the - /// lookahead, any modifications to the source will not be reflected outside - /// the body. - func lookahead(_ body: (inout Source) throws -> T) rethrows -> T { - var src = self - return try body(&src) + /// Try to eat any character, returning `nil` if the input has been exhausted. + mutating func tryEat() -> Char? { + guard let char = peek() else { return nil } + advance() + return char + } + + /// Same as `tryEat()`, but with the source location of the eaten character. 
+ mutating func tryEatWithLoc() -> Located? { + recordLoc { $0.tryEat() } + } + + /// Attempt to eat the given character, returning `true` if successful, + /// `false` otherwise. + mutating func tryEat(_ c: Char) -> Bool { + guard peek() == c else { return false } + advance() + return true } /// Attempt to eat the given character, returning its source location if /// successful, `nil` otherwise. mutating func tryEatWithLoc(_ c: Character) -> SourceLocation? { - let start = currentPosition + let start = src.currentPosition guard tryEat(c) else { return nil } - return .init(start ..< currentPosition) + return .init(start ..< src.currentPosition) + } + + /// Attempt to eat a character if it matches a given predicate, returning + /// `true` if the character was eaten, or `false` if the character did not + /// meet the predicate. + mutating func tryEat(where pred: (Char) -> Bool) -> Bool { + guard let next = peek(), pred(next) else { return false } + advance() + return true + } + + /// Attempt to eat a sequence of characters, returning `true` if successful. + mutating func tryEat( + sequence c: C + ) -> Bool where C.Element == Char { + guard src.starts(with: c) else { return false } + advance(c.count) + return true + } + + /// Attempt to eat any of the given characters, returning the one that was + /// eaten. + mutating func tryEat( + anyOf set: C + ) -> Char? where C.Element == Char { + guard let c = peek(), set.contains(c) else { return nil } + advance() + return c + } + + /// Attempt to eat any of the given characters, returning the one that was + /// eaten. + mutating func tryEat(anyOf set: Char...) -> Char? { + tryEat(anyOf: set) + } + + /// Eat up to `count` characters, returning the range of characters eaten. + mutating func eat(upToCount count: Int) -> Located { + recordLoc { $0.src.eat(upToCount: count).string } } /// Attempt to eat a given prefix that satisfies a given predicate, with the /// source location recorded. 
- mutating func tryEatLocatedPrefix( + mutating func tryEatPrefix( maxLength: Int? = nil, _ f: (Char) -> Bool ) -> Located? { - let result = recordLoc { src in - src.tryEatPrefix(maxLength: maxLength, f) - } - guard let result = result else { return nil } - return result.map(\.string) + recordLoc { $0.src.tryEatPrefix(maxLength: maxLength, f)?.string } } - /// Throws an expected ASCII character error if not matched - mutating func expectASCII() throws -> Located { - try recordLoc { src in - guard let c = src.peek() else { - throw ParseError.unexpectedEndOfInput + /// Attempts to eat an ASCII value, diagnosing an error and returning `nil` + /// if unsuccessful. + mutating func expectASCII() -> Located? { + recordLoc { p in + guard let c = p.tryEat() else { + p.errorAtCurrentPosition(.unexpectedEndOfInput) + return nil } guard c.isASCII else { - throw ParseError.expectedASCII(c) + p.errorAtCurrentPosition(.expectedASCII(c)) + return nil } - src.eat(asserting: c) return c } } @@ -218,31 +284,43 @@ enum IdentifierKind { case onigurumaCalloutTag } -extension Source { +extension Parser { /// Validate a string of digits as a particular radix, and return the number, - /// or throw an error if the string is malformed or would overflow the number - /// type. - private static func validateNumber( - _ str: String, _: Num.Type, _ kind: RadixKind - ) throws -> Num { + /// or diagnose an error if the string is malformed or would overflow the + /// number type. + private mutating func validateNumber( + _ locStr: Located, _: Num.Type, _ kind: RadixKind + ) -> Num? 
{ + let str = locStr.value guard !str.isEmpty && str.all(kind.characterFilter) else { - throw ParseError.expectedNumber(str, kind: kind) + error(.expectedNumber(str, kind: kind), at: locStr.location) + return nil } guard let i = Num(str, radix: kind.radix) else { - throw ParseError.numberOverflow(str) + error(.numberOverflow(str), at: locStr.location) + return nil } return i } /// Validate a string of digits as a unicode scalar of a particular radix, and - /// return the scalar value, or throw an error if the string is malformed or - /// would overflow the scalar. - private static func validateUnicodeScalar( + /// return the scalar value, or diagnose an error if the string is malformed + /// or would overflow the scalar. + private mutating func validateUnicodeScalar( _ str: Source.Located, _ kind: RadixKind - ) throws -> AST.Atom.Scalar { - let num = try validateNumber(str.value, UInt32.self, kind) + ) -> AST.Atom.Scalar { + func nullScalar() -> AST.Atom.Scalar { + // For now, return a null scalar in the case of an error. This should be + // benign as it shouldn't affect other validation logic. + // TODO: Should we store nil like we do with regular numbers? + return .init(UnicodeScalar(0), str.location) + } + guard let num = validateNumber(str, UInt32.self, kind) else { + return nullScalar() + } guard let scalar = Unicode.Scalar(num) else { - throw ParseError.misc("Invalid scalar value U+\(num.hexStr)") + error(.misc("Invalid scalar value U+\(num.hexStr)"), at: str.location) + return nullScalar() } return .init(scalar, str.location) } @@ -251,51 +329,39 @@ extension Source { /// /// Returns: `nil` if there's no number, otherwise the number /// - /// Throws on overflow - /// - private mutating func lexNumber( - _ kind: RadixKind - ) throws -> AST.Atom.Number? 
{ - try recordLoc { src in - guard let str = src.tryEatLocatedPrefix(kind.characterFilter) else { - return nil - } - guard let i = Int(str.value, radix: kind.radix) else { - throw ParseError.numberOverflow(str.value) - } - return .init(i, at: str.location) - }.value - } - - /// Try to eat a number off the front. - /// - /// Returns: `nil` if there's no number, otherwise the number - /// - /// Throws on overflow + /// Diagnoses on overflow /// - mutating func lexNumber() throws -> AST.Atom.Number? { - try lexNumber(.decimal) + mutating func lexNumber(_ kind: RadixKind = .decimal) -> AST.Atom.Number? { + guard let str = tryEatPrefix(kind.characterFilter) else { + return nil + } + guard let i = Int(str.value, radix: kind.radix) else { + error(.numberOverflow(str.value), at: str.location) + return .init(nil, at: str.location) + } + return .init(i, at: str.location) } - mutating func expectNumber() throws -> AST.Atom.Number { - guard let num = try lexNumber() else { - throw ParseError.expectedNumber("", kind: .decimal) + /// Expect a number of a given `kind`, diagnosing if a number cannot be + /// parsed. 
+ mutating func expectNumber(_ kind: RadixKind = .decimal) -> AST.Atom.Number { + guard let num = lexNumber(kind) else { + errorAtCurrentPosition(.expectedNumber("", kind: kind)) + return .init(nil, at: loc(src.currentPosition)) } return num } /// Eat a scalar value from hexadecimal notation off the front - private mutating func expectUnicodeScalar( - numDigits: Int - ) throws -> AST.Atom.Scalar { - let str = try recordLoc { src -> String in - let str = src.eat(upToCount: numDigits).string - guard str.count == numDigits else { - throw ParseError.expectedNumDigits(str, numDigits) + mutating func expectUnicodeScalar(numDigits: Int) -> AST.Atom.Scalar { + let str = recordLoc { p -> String in + let str = p.eat(upToCount: numDigits) + if str.value.count != numDigits { + p.error(.expectedNumDigits(str.value, numDigits), at: str.location) } - return str + return str.value } - return try Source.validateUnicodeScalar(str, .hex) + return validateUnicodeScalar(str, .hex) } /// Try to lex a seqence of hex digit unicode scalars. @@ -305,41 +371,40 @@ extension Source { /// mutating func expectUnicodeScalarSequence( eating ending: Character - ) throws -> AST.Atom.Kind { - try recordLoc { src in - var scalars = [AST.Atom.Scalar]() - var trivia = [AST.Trivia]() - - // Eat up any leading whitespace. - if let t = src.lexWhitespace() { trivia.append(t) } - - while true { - let str = src.lexUntil { src in - // Hit the ending, stop lexing. - if src.isEmpty || src.peek() == ending { - return true - } - // Eat up trailing whitespace, and stop lexing to record the scalar. - if let t = src.lexWhitespace() { - trivia.append(t) - return true - } - // Not the ending or trivia, must be a digit of the scalar. - return false + ) -> AST.Atom.Kind { + var scalars = [AST.Atom.Scalar]() + var trivia = [AST.Trivia]() + + // Eat up any leading whitespace. + if let t = lexWhitespace() { trivia.append(t) } + + while true { + let str = lexUntil { p in + // Hit the ending, stop lexing. 
+ if p.src.isEmpty || p.peek() == ending { + return true } - guard !str.value.isEmpty else { break } - scalars.append(try Source.validateUnicodeScalar(str, .hex)) - } - guard !scalars.isEmpty else { - throw ParseError.expectedNumber("", kind: .hex) + // Eat up trailing whitespace, and stop lexing to record the scalar. + if let t = p.lexWhitespace() { + trivia.append(t) + return true + } + // Not the ending or trivia, must be a digit of the scalar. + return false } - try src.expect(ending) + guard !str.value.isEmpty else { break } + scalars.append(validateUnicodeScalar(str, .hex)) + } + expect(ending) - if scalars.count == 1 { - return .scalar(scalars[0]) - } - return .scalarSequence(.init(scalars, trivia: trivia)) - }.value + if scalars.isEmpty { + errorAtCurrentPosition(.expectedNumber("", kind: .hex)) + return .scalar(.init(UnicodeScalar(0), loc(src.currentPosition))) + } + if scalars.count == 1 { + return .scalar(scalars[0]) + } + return .scalarSequence(.init(scalars, trivia: trivia)) } /// Try to eat a scalar off the front, starting from after the backslash and @@ -353,62 +418,59 @@ extension Source { /// | 'o{' OctalDigit{1...} '}' /// | '0' OctalDigit{0...3} /// - mutating func lexUnicodeScalar() throws -> AST.Atom.Kind? { - try recordLoc { src in - try src.tryEating { src in + mutating func lexUnicodeScalar() -> AST.Atom.Kind? { + tryEating { p in - func nullScalar() -> AST.Atom.Kind { - let pos = src.currentPosition - return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos))) - } + func nullScalar() -> AST.Atom.Scalar { + .init(UnicodeScalar(0), p.loc(p.src.currentPosition)) + } - // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set. - switch src.tryEat() { + // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set. + switch p.tryEat() { // Hex numbers. 
- case "u" where src.tryEat("{"): - return try src.expectUnicodeScalarSequence(eating: "}") + case "u" where p.tryEat("{"): + return p.expectUnicodeScalarSequence(eating: "}") - case "x" where src.tryEat("{"): - let str = try src.lexUntil(eating: "}") - return .scalar(try Source.validateUnicodeScalar(str, .hex)) + case "x" where p.tryEat("{"): + let str = p.lexUntil(eating: "}") + return .scalar(p.validateUnicodeScalar(str, .hex)) - case "x": - // \x expects *up to* 2 digits. - guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit) - else { - // In PCRE, \x without any valid hex digits is \u{0}. - // TODO: This doesn't appear to be followed by ICU or Oniguruma, so - // could be changed to throw an error if we had a parsing mode for - // them. - return nullScalar() - } - return .scalar(try Source.validateUnicodeScalar(digits, .hex)) + case "x": + // \x expects *up to* 2 digits. + guard let digits = p.tryEatPrefix(maxLength: 2, \.isHexDigit) + else { + // In PCRE, \x without any valid hex digits is \u{0}. + // TODO: This doesn't appear to be followed by ICU or Oniguruma, so + // could be changed to diagnose an error if we had a parsing mode for + // them. + return .scalar(nullScalar()) + } + return .scalar(p.validateUnicodeScalar(digits, .hex)) - case "u": - return .scalar(try src.expectUnicodeScalar(numDigits: 4)) - case "U": - return .scalar(try src.expectUnicodeScalar(numDigits: 8)) + case "u": + return .scalar(p.expectUnicodeScalar(numDigits: 4)) + case "U": + return .scalar(p.expectUnicodeScalar(numDigits: 8)) // Octal numbers. - case "o" where src.tryEat("{"): - let str = try src.lexUntil(eating: "}") - return .scalar(try Source.validateUnicodeScalar(str, .octal)) - - case "0": - // We can read *up to* 3 more octal digits. - // FIXME: PCRE can only read up to 2 octal digits, if we get a strict - // PCRE mode, we should limit it here. 
- guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit) - else { - return nullScalar() - } - return .scalar(try Source.validateUnicodeScalar(digits, .octal)) - - default: - return nil + case "o" where p.tryEat("{"): + let str = p.lexUntil(eating: "}") + return .scalar(p.validateUnicodeScalar(str, .octal)) + + case "0": + // We can read *up to* 3 more octal digits. + // FIXME: PCRE can only read up to 2 octal digits, if we get a strict + // PCRE mode, we should limit it here. + guard let digits = p.tryEatPrefix(maxLength: 3, \.isOctalDigit) + else { + return .scalar(nullScalar()) } + return .scalar(p.validateUnicodeScalar(digits, .octal)) + + default: + return nil } - }.value + } } /// Try to consume a quantifier @@ -417,21 +479,20 @@ extension Source { /// QuantKind -> '?' | '+' /// mutating func lexQuantifier( - context: ParsingContext - ) throws -> (Located, Located, [AST.Trivia])? { + ) -> (Located, Located, [AST.Trivia])? { var trivia: [AST.Trivia] = [] - if let t = lexNonSemanticWhitespace(context: context) { trivia.append(t) } + if let t = lexNonSemanticWhitespace() { trivia.append(t) } - let amt: Located? = try recordLoc { src in - if src.tryEat("*") { return .zeroOrMore } - if src.tryEat("+") { return .oneOrMore } - if src.tryEat("?") { return .zeroOrOne } + let amt: Located? = recordLoc { p in + if p.tryEat("*") { return .zeroOrMore } + if p.tryEat("+") { return .oneOrMore } + if p.tryEat("?") { return .zeroOrOne } - return try src.tryEating { src in - guard src.tryEat("{"), - let range = try src.lexRange(context: context, trivia: &trivia), - src.tryEat("}") + return p.tryEating { p in + guard p.tryEat("{"), + let range = p.lexRange(trivia: &trivia), + p.tryEat("}") else { return nil } return range.value } @@ -439,11 +500,11 @@ extension Source { guard let amt = amt else { return nil } // PCRE allows non-semantic whitespace here in extended syntax mode. 
- if let t = lexNonSemanticWhitespace(context: context) { trivia.append(t) } + if let t = lexNonSemanticWhitespace() { trivia.append(t) } - let kind: Located = recordLoc { src in - if src.tryEat("?") { return .reluctant } - if src.tryEat("+") { return .possessive } + let kind: Located = recordLoc { p in + if p.tryEat("?") { return .reluctant } + if p.tryEat("+") { return .possessive } return .eager } @@ -456,45 +517,40 @@ extension Source { /// | ExpRange /// ExpRange -> '..<' | '...' /// | '..<' | '...' ? - mutating func lexRange( - context: ParsingContext, trivia: inout [AST.Trivia] - ) throws -> Located? { - try recordLoc { src in - try src.tryEating { src in - if let t = src.lexWhitespace() { trivia.append(t) } + mutating func lexRange(trivia: inout [AST.Trivia]) -> Located? { + recordLoc { p in + p.tryEating { p in + if let t = p.lexWhitespace() { trivia.append(t) } - let lowerOpt = try src.lexNumber() + let lowerOpt = p.lexNumber() - if let t = src.lexWhitespace() { trivia.append(t) } + if let t = p.lexWhitespace() { trivia.append(t) } // ',' or '...' or '..<' or nothing - // TODO: We ought to try and consume whitespace here and emit a - // diagnostic for the user warning them that it would cause the range to - // be treated as literal. let closedRange: Bool? - if src.tryEat(",") { + if p.tryEat(",") { closedRange = true - } else if context.experimentalRanges && src.tryEat(".") { - try src.expect(".") - if src.tryEat(".") { + } else if p.context.experimentalRanges && p.tryEat(".") { + p.expect(".") + if p.tryEat(".") { closedRange = true } else { - try src.expect("<") + p.expect("<") closedRange = false } } else { closedRange = nil } - if let t = src.lexWhitespace() { trivia.append(t) } + if let t = p.lexWhitespace() { trivia.append(t) } - var upperOpt = try src.lexNumber() + var upperOpt = p.lexNumber() if closedRange == false { // If we have an open range, the upper bound should be adjusted down. upperOpt?.value? 
-= 1 } - if let t = src.lexWhitespace() { trivia.append(t) } + if let t = p.lexWhitespace() { trivia.append(t) } switch (lowerOpt, closedRange, upperOpt) { case let (l?, nil, nil): @@ -507,7 +563,8 @@ extension Source { return .range(l, u) case (nil, nil, _?): - fatalError("Didn't lex lower bound, but lexed upper bound?") + p.unreachable("Didn't lex lower bound, but lexed upper bound?") + return nil default: return nil } @@ -516,34 +573,31 @@ extension Source { } private mutating func lexUntil( - _ predicate: (inout Source) throws -> Bool - ) rethrows -> Located { - // We track locations outside of recordLoc, as the predicate may advance the - // input when we hit the end, and we don't want that to affect the location - // of what was lexed in the `result`. We still want the recordLoc call to - // attach locations to any thrown errors though. + _ predicate: (inout Self) -> Bool + ) -> Located { + // We track locations without using recordLoc, as the predicate may advance + // the input when we hit the end, and we don't want that to affect the + // location of what was lexed in the `result`. // TODO: We should find a better way of doing this, `lexUntil` seems full // of footguns. 
- let start = currentPosition - var end = currentPosition + let start = src.currentPosition + var end = src.currentPosition var result = "" - try recordLoc { src in - while try !predicate(&src) { - result.append(src.eat()) - end = src.currentPosition - } + while !predicate(&self), let c = tryEat() { + result.append(c) + end = src.currentPosition } return .init(result, start ..< end) } - private mutating func lexUntil(eating end: String) throws -> Located { - try lexUntil { try $0.tryEatNonEmpty(sequence: end) } + private mutating func lexUntil(eating end: String) -> Located { + lexUntil { $0.tryEatNonEmpty(sequence: end) } } private mutating func lexUntil( eating end: Character - ) throws -> Located { - try lexUntil(eating: String(end)) + ) -> Located { + lexUntil(eating: String(end)) } /// Expect a linear run of non-nested non-empty content ending with a given @@ -552,28 +606,28 @@ extension Source { private mutating func expectQuoted( endingWith endSingle: String, count: Int = 1, ignoreEscaped: Bool = false, eatEnding: Bool = true - ) throws -> Located { + ) -> Located { let end = String(repeating: endSingle, count: count) - let result = try recordLoc { src -> String in - try src.lexUntil { src in - if src.starts(with: end) { + let result = recordLoc { p -> String in + p.lexUntil { p in + if p.src.starts(with: end) { return true } - try src.expectNonEmpty(.expected(endSingle)) + guard p.expectNonEmpty(.expected(endSingle)) else { return true } // Ignore escapes if we're allowed to. lexUntil will consume the next // character. 
- if ignoreEscaped, src.tryEat("\\") { - try src.expectNonEmpty(.expectedEscape) + if ignoreEscaped, p.tryEat("\\") { + guard p.expectNonEmpty(.expectedEscape) else { return true } } return false }.value } - guard !result.value.isEmpty else { - throw ParseError.expectedNonEmptyContents + if result.value.isEmpty { + error(.expectedNonEmptyContents, at: result.location) } if eatEnding { - try expect(sequence: end) + expect(sequence: end) } return result } @@ -590,28 +644,28 @@ extension Source { /// /// TODO: Need to support some escapes /// - mutating func lexQuote(context: ParsingContext) throws -> AST.Quote? { - let str = try recordLoc { src -> String? in - if src.tryEat(sequence: #"\Q"#) { - let contents = src.lexUntil { src in - src.isEmpty || src.tryEat(sequence: #"\E"#) - }.value + mutating func lexQuote() -> AST.Quote? { + let str = recordLoc { p -> String? in + if p.tryEat(sequence: #"\Q"#) { + let contents = p.lexUntil { p in + p.src.isEmpty || p.tryEat(sequence: #"\E"#) + } // In multi-line literals, the quote may not span multiple lines. - if context.syntax.contains(.multilineCompilerLiteral), - contents.spansMultipleLinesInRegexLiteral { - throw ParseError.quoteMayNotSpanMultipleLines + if p.context.syntax.contains(.multilineCompilerLiteral), + contents.value.spansMultipleLinesInRegexLiteral { + p.error(.quoteMayNotSpanMultipleLines, at: contents.location) } // The sequence must not be empty in a custom character class. - if context.isInCustomCharacterClass && contents.isEmpty { - throw ParseError.expectedNonEmptyContents + if p.context.isInCustomCharacterClass && contents.value.isEmpty { + p.error(.expectedNonEmptyContents, at: contents.location) } - return contents + return contents.value } - if context.experimentalQuotes, src.tryEat("\"") { + if p.context.experimentalQuotes, p.tryEat("\"") { // TODO: Can experimental quotes be empty? 
- return try src.expectQuoted(endingWith: "\"", ignoreEscaped: true).value + return p.expectQuoted(endingWith: "\"", ignoreEscaped: true).value } return nil } @@ -623,16 +677,13 @@ extension Source { /// /// Interpolation -> '<{' String '}>' /// - mutating func lexInterpolation() throws -> AST.Interpolation? { - let contents = try recordLoc { src -> String? in - try src.tryEating { src in - guard src.tryEat(sequence: "<{") else { return nil } - _ = src.lexUntil { $0.isEmpty || $0.starts(with: "}>") } - guard src.tryEat(sequence: "}>") else { return nil } - - // Not currently supported. We error here instead of during Sema to - // get a better error for something like `(<{)}>`. - throw ParseError.unsupported("interpolation") + mutating func lexInterpolation() -> AST.Interpolation? { + let contents = recordLoc { p -> String? in + p.tryEating { p in + guard p.tryEat(sequence: "<{") else { return nil } + let contents = p.lexUntil { $0.src.isEmpty || $0.src.starts(with: "}>") } + guard p.tryEat(sequence: "}>") else { return nil } + return contents.value } } guard let contents = contents else { return nil } @@ -653,34 +704,34 @@ extension Source { /// /// TODO: Swift-style nested comments, line-ending comments, etc /// - mutating func lexComment(context: ParsingContext) throws -> AST.Trivia? { - let trivia: Located? = try recordLoc { src in - if !context.isInCustomCharacterClass && src.tryEat(sequence: "(?#") { - return try src.lexUntil(eating: ")").value + mutating func lexComment() -> AST.Trivia? { + let trivia: Located? 
= recordLoc { p in + if !p.context.isInCustomCharacterClass && p.tryEat(sequence: "(?#") { + return p.lexUntil(eating: ")").value } - if context.experimentalComments, src.tryEat(sequence: "/*") { - return try src.lexUntil(eating: "*/").value + if p.context.experimentalComments, p.tryEat(sequence: "/*") { + return p.lexUntil(eating: "*/").value } - if context.endOfLineComments, src.tryEat("#") { + if p.context.endOfLineComments, p.tryEat("#") { // Try eat until we either exhaust the input, or hit a newline. Note // that the definition of newline can be altered depending on the global // matching options. By default we consider a newline to be `\n` or // `\r`. - return src.lexUntil { src in - if src.isEmpty { return true } - switch context.newlineMode { + return p.lexUntil { p in + if p.src.isEmpty { return true } + switch p.context.newlineMode { case .carriageReturnOnly: - return src.tryEat("\r") + return p.tryEat("\r") case .linefeedOnly: - return src.tryEat("\n") + return p.tryEat("\n") case .carriageAndLinefeedOnly: - return src.tryEat("\r\n") + return p.tryEat("\r\n") case .anyCarriageReturnOrLinefeed: - return src.tryEat(anyOf: "\r", "\n", "\r\n") != nil + return p.tryEat(anyOf: "\r", "\n", "\r\n") != nil case .anyUnicode: - return src.tryEat(where: \.isNewline) + return p.tryEat(where: \.isNewline) case .nulCharacter: - return src.tryEat("\0") + return p.tryEat("\0") } }.value } @@ -695,9 +746,7 @@ extension Source { /// Whitespace -> WhitespaceChar+ /// /// Does nothing unless `SyntaxOptions.nonSemanticWhitespace` is set - mutating func lexNonSemanticWhitespace( - context: ParsingContext - ) -> AST.Trivia? { + mutating func lexNonSemanticWhitespace() -> AST.Trivia? { guard context.ignoreWhitespace else { return nil } // FIXME: PCRE only treats space and tab characters as whitespace when @@ -714,10 +763,7 @@ extension Source { /// Unlike `lexNonSemanticWhitespace`, this will always attempt to lex /// whitespace. mutating func lexWhitespace() -> AST.Trivia? 
{ - let trivia: Located? = recordLoc { src in - src.tryEatPrefix(\.isPatternWhitespace)?.string - } - guard let trivia = trivia else { return nil } + guard let trivia = tryEatPrefix(\.isPatternWhitespace) else { return nil } return AST.Trivia(trivia) } @@ -725,11 +771,11 @@ extension Source { /// /// Trivia -> Comment | Whitespace /// - mutating func lexTrivia(context: ParsingContext) throws -> AST.Trivia? { - if let comment = try lexComment(context: context) { + mutating func lexTrivia() -> AST.Trivia? { + if let comment = lexComment() { return comment } - if let whitespace = lexNonSemanticWhitespace(context: context) { + if let whitespace = lexNonSemanticWhitespace() { return whitespace } return nil @@ -740,55 +786,51 @@ extension Source { /// MatchingOption -> 'i' | 'J' | 'm' | 'n' | 's' | 'U' | 'x' | 'xx' | 'w' /// | 'D' | 'P' | 'S' | 'W' | 'y{' ('g' | 'w') '}' /// - mutating func lexMatchingOption() throws -> AST.MatchingOption? { + mutating func lexMatchingOption() -> AST.MatchingOption? { typealias OptKind = AST.MatchingOption.Kind - let locOpt = try recordLoc { src -> OptKind? in - func advanceAndReturn(_ o: OptKind) -> OptKind { - src.advance() - return o - } - guard let c = src.peek() else { return nil } - switch c { - // PCRE options. - case "i": return advanceAndReturn(.caseInsensitive) - case "J": return advanceAndReturn(.allowDuplicateGroupNames) - case "m": return advanceAndReturn(.multiline) - case "n": return advanceAndReturn(.namedCapturesOnly) - case "s": return advanceAndReturn(.singleLine) - case "U": return advanceAndReturn(.reluctantByDefault) - case "x": - src.advance() - return src.tryEat("x") ? .extraExtended : .extended - - // ICU options. - case "w": return advanceAndReturn(.unicodeWordBoundaries) - - // Oniguruma options. 
- case "D": return advanceAndReturn(.asciiOnlyDigit) - case "P": return advanceAndReturn(.asciiOnlyPOSIXProps) - case "S": return advanceAndReturn(.asciiOnlySpace) - case "W": return advanceAndReturn(.asciiOnlyWord) - case "y": - src.advance() - try src.expect("{") - let opt: OptKind - if src.tryEat("w") { - opt = .textSegmentWordMode - } else { - try src.expect("g") - opt = .textSegmentGraphemeMode + let locOpt = recordLoc { p -> OptKind? in + p.tryEating { p in + guard let c = p.tryEat() else { return nil } + switch c { + // PCRE options. + case "i": return .caseInsensitive + case "J": return .allowDuplicateGroupNames + case "m": return .multiline + case "n": return .namedCapturesOnly + case "s": return .singleLine + case "U": return .reluctantByDefault + case "x": + return p.tryEat("x") ? .extraExtended : .extended + + // ICU options. + case "w": return .unicodeWordBoundaries + + // Oniguruma options. + case "D": return .asciiOnlyDigit + case "P": return .asciiOnlyPOSIXProps + case "S": return .asciiOnlySpace + case "W": return .asciiOnlyWord + case "y": + p.expect("{") + let opt: OptKind + if p.tryEat("w") { + opt = .textSegmentWordMode + } else { + p.expect("g") + opt = .textSegmentGraphemeMode + } + p.expect("}") + return opt + + // Swift semantic level options + case "X": return .graphemeClusterSemantics + case "u": return .unicodeScalarSemantics + case "b": return .byteSemantics + + default: + return nil } - try src.expect("}") - return opt - - // Swift semantic level options - case "X": return advanceAndReturn(.graphemeClusterSemantics) - case "u": return advanceAndReturn(.unicodeScalarSemantics) - case "b": return advanceAndReturn(.byteSemantics) - - default: - return nil } } guard let locOpt = locOpt else { return nil } @@ -800,109 +842,98 @@ extension Source { /// MatchingOptionSeq -> '^' MatchingOption* | MatchingOption+ /// | MatchingOption* '-' MatchingOption* /// - mutating func lexMatchingOptionSequence( - context: ParsingContext - ) throws -> 
AST.MatchingOptionSequence? { + mutating func lexMatchingOptionSequence() -> AST.MatchingOptionSequence? { // PCRE accepts '(?)' // TODO: This is a no-op, should we warn? if peek() == ")" { return .init(caretLoc: nil, adding: [], minusLoc: nil, removing: []) } - let ateCaret = recordLoc { $0.tryEat("^") } + let caret = tryEatWithLoc("^") // TODO: Warn on duplicate options, and options appearing in both adding // and removing lists? var adding: [AST.MatchingOption] = [] - while let opt = try lexMatchingOption() { + while let opt = lexMatchingOption() { adding.append(opt) } - // If the sequence begun with a caret '^', options can only be added, so - // we're done. - if ateCaret.value { - if peek() == "-" { - throw ParseError.cannotRemoveMatchingOptionsAfterCaret - } - return .init(caretLoc: ateCaret.location, adding: adding, minusLoc: nil, - removing: []) - } - // Try to lex options to remove. - let ateMinus = recordLoc { $0.tryEat("-") } - if ateMinus.value { - var removing: [AST.MatchingOption] = [] - while let opt = try lexMatchingOption() { + var removing: [AST.MatchingOption] = [] + let minus = tryEatWithLoc("-") + if minus != nil { + if let caret = caret { + // Options cannot be removed if '^' is used. + error(.cannotRemoveMatchingOptionsAfterCaret, at: caret) + } + while let opt = lexMatchingOption() { // Text segment options can only be added, they cannot be removed // with (?-), they should instead be set to a different mode. if opt.isTextSegmentMode { - throw ParseError.cannotRemoveTextSegmentOptions + error(.cannotRemoveTextSegmentOptions, at: opt.location) } // Matching semantics options can only be added, not removed. 
if opt.isSemanticMatchingLevel { - throw ParseError.cannotRemoveSemanticsOptions + error(.cannotRemoveSemanticsOptions, at: opt.location) } removing.append(opt) } - return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location, - removing: removing) } - guard !adding.isEmpty else { return nil } - return .init(caretLoc: nil, adding: adding, minusLoc: nil, removing: []) + // We must have lexed at least something to proceed. + guard caret != nil || minus != nil || !adding.isEmpty else { return nil } + return .init( + caretLoc: caret, adding: adding, minusLoc: minus, removing: removing) } /// A matching option changing atom. /// /// '(?' MatchingOptionSeq ')' /// - mutating func lexChangeMatchingOptionAtom( - context: ParsingContext - ) throws -> AST.MatchingOptionSequence? { - try tryEating { src in - guard src.tryEat(sequence: "(?"), - let seq = try src.lexMatchingOptionSequence(context: context) + mutating func lexChangeMatchingOptionAtom() -> AST.MatchingOptionSequence? { + tryEating { p in + guard p.tryEat(sequence: "(?"), let seq = p.lexMatchingOptionSequence() else { return nil } - try src.expect(")") + p.expect(")") return seq } } /// Try to consume explicitly spelled-out PCRE2 group syntax. mutating func lexExplicitPCRE2GroupStart() -> AST.Group.Kind? 
{ - tryEating { src in - guard src.tryEat(sequence: "(*") else { return nil } + tryEating { p in + guard p.tryEat(sequence: "(*") else { return nil } - if src.tryEat(sequence: "atomic:") { + if p.tryEat(sequence: "atomic:") { return .atomicNonCapturing } - if src.tryEat(sequence: "pla:") || - src.tryEat(sequence: "positive_lookahead:") { + if p.tryEat(sequence: "pla:") || + p.tryEat(sequence: "positive_lookahead:") { return .lookahead } - if src.tryEat(sequence: "nla:") || - src.tryEat(sequence: "negative_lookahead:") { + if p.tryEat(sequence: "nla:") || + p.tryEat(sequence: "negative_lookahead:") { return .negativeLookahead } - if src.tryEat(sequence: "plb:") || - src.tryEat(sequence: "positive_lookbehind:") { + if p.tryEat(sequence: "plb:") || + p.tryEat(sequence: "positive_lookbehind:") { return .lookbehind } - if src.tryEat(sequence: "nlb:") || - src.tryEat(sequence: "negative_lookbehind:") { + if p.tryEat(sequence: "nlb:") || + p.tryEat(sequence: "negative_lookbehind:") { return .negativeLookbehind } - if src.tryEat(sequence: "napla:") || - src.tryEat(sequence: "non_atomic_positive_lookahead:") { + if p.tryEat(sequence: "napla:") || + p.tryEat(sequence: "non_atomic_positive_lookahead:") { return .nonAtomicLookahead } - if src.tryEat(sequence: "naplb:") || - src.tryEat(sequence: "non_atomic_positive_lookbehind:") { + if p.tryEat(sequence: "naplb:") || + p.tryEat(sequence: "non_atomic_positive_lookbehind:") { return .nonAtomicLookbehind } - if src.tryEat(sequence: "sr:") || src.tryEat(sequence: "script_run:") { + if p.tryEat(sequence: "sr:") || p.tryEat(sequence: "script_run:") { return .scriptRun } - if src.tryEat(sequence: "asr:") || - src.tryEat(sequence: "atomic_script_run:") { + if p.tryEat(sequence: "asr:") || + p.tryEat(sequence: "atomic_script_run:") { return .atomicScriptRun } return nil @@ -915,34 +946,28 @@ extension Source { /// private mutating func expectIdentifier( _ kind: IdentifierKind, endingWith ending: String, eatEnding: Bool = true - ) 
throws -> Located { - let str = try recordLoc { src -> String in - if src.isEmpty || src.tryEat(sequence: ending) { - throw ParseError.expectedIdentifier(kind) + ) -> Located { + let str = recordLoc { p -> String in + guard !p.src.isEmpty && !p.src.starts(with: ending) else { + p.errorAtCurrentPosition(.expectedIdentifier(kind)) + return "" } - if src.peek()!.isNumber { - throw ParseError.identifierCannotStartWithNumber(kind) + let firstChar = p.peekWithLoc()! + if firstChar.value.isNumber { + p.error(.identifierCannotStartWithNumber(kind), at: firstChar.location) } - guard let str = src.tryEatPrefix(\.isWordCharacter)?.string else { - throw ParseError.identifierMustBeAlphaNumeric(kind) + guard let str = p.tryEatPrefix(\.isWordCharacter) else { + p.error(.identifierMustBeAlphaNumeric(kind), at: firstChar.location) + return "" } - return str + return str.value } if eatEnding { - try expect(sequence: ending) + expect(sequence: ending) } return str } - /// Try to consume an identifier, returning `nil` if unsuccessful. - private mutating func lexIdentifier( - _ kind: IdentifierKind, endingWith end: String, eatEnding: Bool = true - ) -> Located? { - tryEating { src in - try? src.expectIdentifier(kind, endingWith: end, eatEnding: eatEnding) - } - } - /// Consume a named group field, producing either a named capture or balanced /// capture. /// @@ -953,23 +978,23 @@ extension Source { /// private mutating func expectNamedGroup( endingWith ending: String - ) throws -> AST.Group.Kind { - func lexBalanced(_ lhs: Located? = nil) throws -> AST.Group.Kind? { + ) -> AST.Group.Kind { + func lexBalanced(_ lhs: Located? = nil) -> AST.Group.Kind? { // If we have a '-', this is a .NET-style 'balanced group'. 
guard let dash = tryEatWithLoc("-") else { return nil } - let rhs = try expectIdentifier(.groupName, endingWith: ending) + let rhs = expectIdentifier(.groupName, endingWith: ending) return .balancedCapture(.init(name: lhs, dash: dash, priorName: rhs)) } // Lex a group name, trying to lex a '-rhs' for a balanced capture group // both before and after. - if let b = try lexBalanced() { return b } - let name = try expectIdentifier( + if let b = lexBalanced() { return b } + let name = expectIdentifier( .groupName, endingWith: ending, eatEnding: false ) - if let b = try lexBalanced(name) { return b } + if let b = lexBalanced(name) { return b } - try expect(sequence: ending) + expect(sequence: ending) return .namedCapture(name) } @@ -990,15 +1015,13 @@ extension Source { /// need to be parsed earlier than the group check, as /// comments, like quotes, cannot be quantified. /// - mutating func lexGroupStart( - context: ParsingContext - ) throws -> Located? { - try recordLoc { src in - try src.tryEating { src in + mutating func lexGroupStart() -> Located? { + recordLoc { p in + p.tryEating { p in // Explicitly spelled out PRCE2 syntax for some groups. This needs to be // done before group-like atoms, as it uses the '(*' syntax, which is // otherwise a group-like atom. - if let g = src.lexExplicitPCRE2GroupStart() { return g } + if let g = p.lexExplicitPCRE2GroupStart() { return g } // There are some atoms that syntactically look like groups, bail here // if we see any. Care needs to be taken here as e.g a group starting @@ -1006,54 +1029,57 @@ extension Source { // otherwise a matching option specifier. Conversely, '(?P' can be the // start of a matching option sequence, or a reference if it is followed // by '=' or '<'. 
- guard !src.shouldLexGroupLikeAtom(context: context) else { return nil } + guard !p.shouldLexGroupLikeAtom() else { return nil } - guard src.tryEat("(") else { return nil } - if src.tryEat("?") { - if src.tryEat(":") { return .nonCapture } - if src.tryEat("|") { return .nonCaptureReset } - if src.tryEat(">") { return .atomicNonCapturing } - if src.tryEat("=") { return .lookahead } - if src.tryEat("!") { return .negativeLookahead } - if src.tryEat("*") { return .nonAtomicLookahead } + guard p.tryEat("(") else { return nil } + if p.tryEat("?") { + if p.tryEat(":") { return .nonCapture } + if p.tryEat("|") { return .nonCaptureReset } + if p.tryEat(">") { return .atomicNonCapturing } + if p.tryEat("=") { return .lookahead } + if p.tryEat("!") { return .negativeLookahead } + if p.tryEat("*") { return .nonAtomicLookahead } - if src.tryEat(sequence: "<=") { return .lookbehind } - if src.tryEat(sequence: "") + if p.tryEat("<") || p.tryEat(sequence: "P<") { + return p.expectNamedGroup(endingWith: ">") } - if src.tryEat("'") { - return try src.expectNamedGroup(endingWith: "'") + if p.tryEat("'") { + return p.expectNamedGroup(endingWith: "'") } // Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:). 
- if let seq = try src.lexMatchingOptionSequence(context: context) { - guard src.tryEat(":") else { - if let next = src.peek() { - throw ParseError.invalidMatchingOption(next) + if let seq = p.lexMatchingOptionSequence() { + if !p.tryEat(":") { + if let next = p.peekWithLoc() { + p.error(.invalidMatchingOption(next.value), at: next.location) + } else { + p.errorAtCurrentPosition(.expected(")")) } - throw ParseError.expected(")") } return .changeMatchingOptions(seq) } - guard let next = src.peek() else { - throw ParseError.expectedGroupSpecifier + if let next = p.peekWithLoc() { + p.error(.unknownGroupKind("?\(next.value)"), at: next.location) + } else { + p.errorAtCurrentPosition(.expectedGroupSpecifier) } - throw ParseError.unknownGroupKind("?\(next)") + return .nonCapture } // (_:) - if context.experimentalCaptures && src.tryEat(sequence: "_:") { + if p.context.experimentalCaptures && p.tryEat(sequence: "_:") { return .nonCapture } // TODO: (name:) // If (?n) is set, a bare (...) group is non-capturing. - if context.syntax.contains(.namedCapturesOnly) { + if p.context.syntax.contains(.namedCapturesOnly) { return .nonCapture } return .capture @@ -1066,12 +1092,12 @@ extension Source { /// PCREVersionNumber -> . /// private mutating func expectPCREVersionNumber( - ) throws -> AST.Conditional.Condition.PCREVersionNumber { - let nums = try recordLoc { src -> (major: AST.Atom.Number, - minor: AST.Atom.Number) in - let major = try src.expectNumber() - try src.expect(".") - let minor = try src.expectNumber() + ) -> AST.Conditional.Condition.PCREVersionNumber { + let nums = recordLoc { p -> (major: AST.Atom.Number, + minor: AST.Atom.Number) in + let major = p.expectNumber() + p.expect(".") + let minor = p.expectNumber() return (major, minor) } return .init(major: nums.value.major, minor: nums.value.minor, @@ -1083,14 +1109,14 @@ extension Source { /// PCREVersionCheck -> '>'? 
'=' PCREVersionNumber /// private mutating func expectPCREVersionCheck( - ) throws -> AST.Conditional.Condition.Kind { + ) -> AST.Conditional.Condition.Kind { typealias Kind = AST.Conditional.Condition.PCREVersionCheck.Kind - let kind = try recordLoc { src -> Kind in - let greaterThan = src.tryEat(">") - try src.expect("=") + let kind = recordLoc { p -> Kind in + let greaterThan = p.tryEat(">") + p.expect("=") return greaterThan ? .greaterThanOrEqual : .equal } - return .pcreVersionCheck(.init(kind, try expectPCREVersionNumber())) + return .pcreVersionCheck(.init(kind, expectPCREVersionNumber())) } /// Try to lex a known condition (excluding group conditions). @@ -1105,46 +1131,44 @@ extension Source { /// | NumberRef /// | NameRef /// - private mutating func lexKnownCondition( - context: ParsingContext - ) throws -> AST.Conditional.Condition? { + private mutating func lexKnownCondition() -> AST.Conditional.Condition? { typealias ConditionKind = AST.Conditional.Condition.Kind - let kind = try recordLoc { src -> ConditionKind? in - try src.tryEating { src in + let kind = recordLoc { p -> ConditionKind? in + p.tryEating { p in // PCRE recursion check. - if src.tryEat("R") { - if src.tryEat("&") { + if p.tryEat("R") { + if p.tryEat("&") { return .groupRecursionCheck( - try src.expectNamedReference(endingWith: ")", eatEnding: false)) + p.expectNamedReference(endingWith: ")", eatEnding: false)) } - if let num = try src.lexNumber() { + if let num = p.lexNumber() { return .groupRecursionCheck( .init(.absolute(num), innerLoc: num.location)) } return .recursionCheck } - if let open = src.tryEat(anyOf: "<", "'") { + if let open = p.tryEat(anyOf: "<", "'") { // In PCRE, this can only be a named reference. In Oniguruma, it can // also be a numbered reference. 
- let closing = String(Source.getClosingDelimiter(for: open)) + let closing = String(p.getClosingDelimiter(for: open)) return .groupMatched( - try src.expectNamedOrNumberedReference(endingWith: closing)) + p.expectNamedOrNumberedReference(endingWith: closing)) } // PCRE group definition and version check. - if src.tryEat(sequence: "DEFINE") { + if p.tryEat(sequence: "DEFINE") { return .defineGroup } - if src.tryEat(sequence: "VERSION") { - return try src.expectPCREVersionCheck() + if p.tryEat(sequence: "VERSION") { + return p.expectPCREVersionCheck() } // If we have a numbered reference, this is a check to see if a group // matched. Oniguruma also permits a recursion level here. - if let num = try src.lexNumberedReference(allowRecursionLevel: true) { + if let num = p.lexNumberedReference(allowRecursionLevel: true) { return .groupMatched(num) } @@ -1155,9 +1179,9 @@ extension Source { // FIXME: This should apply to future groups too. // TODO: We should probably advise users to use the more explicit // syntax. - let nameRef = src.lexNamedReference( + let nameRef = p.lexNamedReference( endingWith: ")", eatEnding: false, allowRecursionLevel: true) - if let nameRef = nameRef, context.isPriorGroupRef(nameRef.kind) { + if let nameRef = nameRef, p.context.isPriorGroupRef(nameRef.kind) { return .groupMatched(nameRef) } return nil @@ -1171,14 +1195,11 @@ extension Source { /// /// KnownConditionalStart -> '(?(' KnownCondition ')' /// - mutating func lexKnownConditionalStart( - context: ParsingContext - ) throws -> AST.Conditional.Condition? { - try tryEating { src in - guard src.tryEat(sequence: "(?("), - let cond = try src.lexKnownCondition(context: context) + mutating func lexKnownConditionalStart() -> AST.Conditional.Condition? { + tryEating { p in + guard p.tryEat(sequence: "(?("), let cond = p.lexKnownCondition() else { return nil } - try src.expect(")") + p.expect(")") return cond } } @@ -1187,12 +1208,10 @@ extension Source { /// /// GroupCondStart -> '(?' 
GroupStart /// - mutating func lexGroupConditionalStart( - context: ParsingContext - ) throws -> Located? { - try tryEating { src in - guard src.tryEat(sequence: "(?") else { return nil } - return try src.lexGroupStart(context: context) + mutating func lexGroupConditionalStart() -> Located? { + tryEating { p in + guard p.tryEat(sequence: "(?") else { return nil } + return p.lexGroupStart() } } @@ -1202,24 +1221,24 @@ extension Source { /// mutating func lexAbsentFunctionStart( ) -> Located? { - recordLoc { src in - if src.tryEat(sequence: "(?~|") { return .withPipe } - if src.tryEat(sequence: "(?~") { return .withoutPipe } + recordLoc { p in + if p.tryEat(sequence: "(?~|") { return .withPipe } + if p.tryEat(sequence: "(?~") { return .withoutPipe } return nil } } mutating func lexCustomCCStart() -> Located? { - recordLoc { src in + recordLoc { p in // Make sure we don't have a POSIX character property. This may require // walking to its ending to make sure we have a closing ':]', as otherwise // we have a custom character class. // TODO: This behavior seems subtle, could we warn? - guard !src.canLexPOSIXCharacterProperty() else { + guard !p.canLexPOSIXCharacterProperty() else { return nil } - if src.tryEat("[") { - return src.tryEat("^") ? .inverted : .normal + if p.tryEat("[") { + return p.tryEat("^") ? .inverted : .normal } return nil } @@ -1229,21 +1248,21 @@ extension Source { /// /// CustomCCBinOp -> '--' | '~~' | '&&' /// - mutating func lexCustomCCBinOp() throws -> Located? { - recordLoc { src in + mutating func lexCustomCCBinOp() -> Located? { + recordLoc { p in // TODO: Perhaps a syntax options check (!PCRE) // TODO: Better AST types here - guard let binOp = src.peekCCBinOp() else { return nil } - try! src.expect(sequence: binOp.rawValue) + guard let binOp = p.peekCCBinOp() else { return nil } + p.expect(sequence: binOp.rawValue) return binOp } } // Check to see if we can lex a binary operator. func peekCCBinOp() -> CustomCC.SetOp? 
{ - if starts(with: "--") { return .subtraction } - if starts(with: "~~") { return .symmetricDifference } - if starts(with: "&&") { return .intersection } + if src.starts(with: "--") { return .subtraction } + if src.starts(with: "~~") { return .symmetricDifference } + if src.starts(with: "&&") { return .intersection } return nil } @@ -1252,52 +1271,40 @@ extension Source { /// /// DotNetSubtraction -> Trivia* '-' Trivia* CustomCharClass /// - func canLexDotNetCharClassSubtraction( - context: ParsingContext - ) -> SourceLocation? { - lookahead { src in + mutating func canLexDotNetCharClassSubtraction() -> SourceLocation? { + lookahead { p in // We can lex '-' as a .NET subtraction if it precedes a custom character // class. - while (try? src.lexTrivia(context: context)) != nil {} - guard let dashLoc = src.tryEatWithLoc("-") else { return nil } - while (try? src.lexTrivia(context: context)) != nil {} - guard src.lexCustomCCStart() != nil else { return nil } + while p.lexTrivia() != nil {} + guard let dashLoc = p.tryEatWithLoc("-") else { return nil } + while p.lexTrivia() != nil {} + guard p.lexCustomCCStart() != nil else { return nil } return dashLoc } } private mutating func lexPOSIXCharacterProperty( - ) throws -> Located? { - try recordLoc { src in - try src.tryEating { src in - guard src.tryEat(sequence: "[:") else { return nil } - let inverted = src.tryEat("^") + ) -> Located? { + recordLoc { p in + p.tryEating { p in + guard p.tryEat(sequence: "[:") else { return nil } + let inverted = p.tryEat("^") // Note we lex the contents and ending *before* classifying, because we // want to bail with nil if we don't have the right ending. This allows // the lexing of a custom character class if we don't have a ':]' // ending. 
- let (key, value) = src.lexCharacterPropertyKeyValue() - guard src.tryEat(sequence: ":]") else { return nil } + let (key, value) = p.lexCharacterPropertyKeyValue() + guard p.tryEat(sequence: ":]") else { return nil } - let prop = try Source.classifyCharacterPropertyContents(key: key, - value: value) + let prop = p.classifyCharacterPropertyContents(key: key, value: value) return .init(prop, isInverted: inverted, isPOSIX: true) } } } - private func canLexPOSIXCharacterProperty() -> Bool { - do { - return try lookahead { src in - try src.lexPOSIXCharacterProperty() != nil - } - } catch { - // We want to tend on the side of lexing a POSIX character property, so - // even if it is invalid in some way (e.g invalid property names), still - // try and lex it. - return true - } + private mutating func canLexPOSIXCharacterProperty() -> Bool { + lookahead { $0.lexPOSIXCharacterProperty() != nil } } /// Try to consume a named character. @@ -1305,26 +1312,26 @@ extension Source { /// NamedCharacter -> '\N{' CharName '}' /// CharName -> 'U+' HexDigit{1...8} | [\s\w-]+ /// - private mutating func lexNamedCharacter() throws -> Located? { - try recordLoc { src in - guard src.tryEat(sequence: "N{") else { return nil } + private mutating func lexNamedCharacter() -> Located? { + recordLoc { p in + guard p.tryEat(sequence: "N{") else { return nil } // We should either have a unicode scalar. - if src.tryEat(sequence: "U+") { - let str = try src.lexUntil(eating: "}") - return .scalar(try Source.validateUnicodeScalar(str, .hex)) + if p.tryEat(sequence: "U+") { + let str = p.lexUntil(eating: "}") + return .scalar(p.validateUnicodeScalar(str, .hex)) } // Or we should have a character name. // TODO: Validate the types of characters that can appear in the name? 
- return .namedCharacter(try src.lexUntil(eating: "}").value) + return .namedCharacter(p.lexUntil(eating: "}").value) } } private mutating func lexCharacterPropertyKeyValue( - ) -> (key: String?, value: String) { - func atPossibleEnding(_ src: inout Source) -> Bool { - guard let next = src.peek() else { return true } + ) -> (key: Located?, value: Located) { + func atPossibleEnding(_ p: inout Self) -> Bool { + guard let next = p.peek() else { return true } switch next { case "=": // End of a key. @@ -1360,21 +1367,21 @@ extension Source { // - 'x=y' where 'x' is a property key, and 'y' is a value. // - 'y' where 'y' is a value (or a bool key with an inferred value of true) // and its key is inferred. - let lhs = lexUntil(atPossibleEnding).value + let lhs = lexUntil(atPossibleEnding) if tryEat("=") { - let rhs = lexUntil(atPossibleEnding).value + let rhs = lexUntil(atPossibleEnding) return (lhs, rhs) } return (nil, lhs) } - private static func classifyCharacterPropertyContents( - key: String?, value: String - ) throws -> AST.Atom.CharacterProperty.Kind { + private mutating func classifyCharacterPropertyContents( + key: Located?, value: Located + ) -> AST.Atom.CharacterProperty.Kind { if let key = key { - return try classifyCharacterProperty(key: key, value: value) + return classifyCharacterProperty(key: key, value: value) } - return try classifyCharacterPropertyValueOnly(value) + return classifyCharacterPropertyValueOnly(value) } /// Try to consume a character property. @@ -1383,17 +1390,18 @@ extension Source { /// Prop -> [\s\w-]+ /// private mutating func lexCharacterProperty( - ) throws -> Located? { - try recordLoc { src in + ) -> Located? 
{ + recordLoc { p in // '\P{...}' is the inverted version of '\p{...}' - guard src.starts(with: "p{") || src.starts(with: "P{") else { return nil } - let isInverted = src.peek() == "P" - src.advance(2) - - let (key, value) = src.lexCharacterPropertyKeyValue() - let prop = try Source.classifyCharacterPropertyContents(key: key, - value: value) - try src.expect("}") + guard p.src.starts(with: "p{") || p.src.starts(with: "P{") else { + return nil + } + let isInverted = p.peek() == "P" + p.advance(2) + + let (key, value) = p.lexCharacterPropertyKeyValue() + let prop = p.classifyCharacterPropertyContents(key: key, value: value) + p.expect("}") return .init(prop, isInverted: isInverted, isPOSIX: false) } } @@ -1404,28 +1412,28 @@ extension Source { /// private mutating func lexNumberedReference( allowWholePatternRef: Bool = false, allowRecursionLevel: Bool = false - ) throws -> AST.Reference? { - let kind = try recordLoc { src -> AST.Reference.Kind? in - try src.tryEating { src in + ) -> AST.Reference? { + let kind = recordLoc { p -> AST.Reference.Kind? in + p.tryEating { p in // Note this logic should match canLexNumberedReference. 
- if let plus = src.tryEatWithLoc("+"), let num = try src.lexNumber() { + if let plus = p.tryEatWithLoc("+"), let num = p.lexNumber() { return .relative(.init(num.value, at: num.location.union(with: plus))) } - if let minus = src.tryEatWithLoc("-"), let num = try src.lexNumber() { + if let minus = p.tryEatWithLoc("-"), let num = p.lexNumber() { let val = num.value.map { x in -x } return .relative(.init(val, at: num.location.union(with: minus))) } - if let num = try src.lexNumber() { + if let num = p.lexNumber() { return .absolute(num) } return nil } } guard let kind = kind else { return nil } - guard allowWholePatternRef || !kind.value.recursesWholePattern else { - throw ParseError.cannotReferToWholePattern + if !allowWholePatternRef && kind.value.recursesWholePattern { + error(.cannotReferToWholePattern, at: kind.location) } - let recLevel = allowRecursionLevel ? try lexRecursionLevel() : nil + let recLevel = allowRecursionLevel ? lexRecursionLevel() : nil let loc = recLevel?.location.union(with: kind.location) ?? kind.location return .init(kind.value, recursionLevel: recLevel, innerLoc: loc) } @@ -1435,10 +1443,10 @@ extension Source { /// RecursionLevel -> '+' | '-' /// private mutating func lexRecursionLevel( - ) throws -> AST.Atom.Number? { - let value = try recordLoc { src -> Int? in - if src.tryEat("+") { return try src.expectNumber().value } - if src.tryEat("-") { return try src.expectNumber().value.map { x in -x } } + ) -> AST.Atom.Number? { + let value = recordLoc { p -> Int? in + if p.tryEat("+") { return p.expectNumber().value } + if p.tryEat("-") { return p.expectNumber().value.map { x in -x } } return nil } guard let value = value else { return nil } @@ -1446,10 +1454,10 @@ extension Source { } /// Checks whether a numbered reference can be lexed. 
- private func canLexNumberedReference() -> Bool { - lookahead { src in - _ = src.tryEat(anyOf: "+", "-") - guard let next = src.peek() else { return false } + private mutating func canLexNumberedReference() -> Bool { + lookahead { p in + _ = p.tryEat(anyOf: "+", "-") + guard let next = p.peek() else { return false } return RadixKind.decimal.characterFilter(next) } } @@ -1458,18 +1466,18 @@ extension Source { private mutating func expectNamedReference( endingWith end: String, eatEnding: Bool = true, allowRecursionLevel: Bool = false - ) throws -> AST.Reference { + ) -> AST.Reference { // Note we don't want to eat the ending as we may also want to parse a // recursion level. - let str = try expectIdentifier( + let str = expectIdentifier( .groupName, endingWith: end, eatEnding: false) - // If we're allowed to, try parse a recursion level. - let recLevel = allowRecursionLevel ? try lexRecursionLevel() : nil + // If we're allowed to, parse a recursion level. + let recLevel = allowRecursionLevel ? lexRecursionLevel() : nil let loc = recLevel?.location.union(with: str.location) ?? str.location if eatEnding { - try expect(sequence: end) + expect(sequence: end) } return .init(.named(str.value), recursionLevel: recLevel, innerLoc: loc) } @@ -1480,8 +1488,8 @@ extension Source { endingWith end: String, eatEnding: Bool = true, allowRecursionLevel: Bool = false ) -> AST.Reference? { - tryEating { src in - try? 
src.expectNamedReference( + tryEating { p in + p.expectNamedReference( endingWith: end, eatEnding: eatEnding, allowRecursionLevel: allowRecursionLevel ) @@ -1495,32 +1503,34 @@ extension Source { private mutating func expectNamedOrNumberedReference( endingWith ending: String, eatEnding: Bool = true, allowWholePatternRef: Bool = false, allowRecursionLevel: Bool = false - ) throws -> AST.Reference { - let num = try lexNumberedReference( + ) -> AST.Reference { + let num = lexNumberedReference( allowWholePatternRef: allowWholePatternRef, allowRecursionLevel: allowRecursionLevel ) if let num = num { if eatEnding { - try expect(sequence: ending) + expect(sequence: ending) } return num } - return try expectNamedReference( + return expectNamedReference( endingWith: ending, eatEnding: eatEnding, allowRecursionLevel: allowRecursionLevel ) } - private static func getClosingDelimiter( + private mutating func getClosingDelimiter( for openChar: Character ) -> Character { switch openChar { // Identically-balanced delimiters. - case "'", "\"", "`", "^", "%", "#", "$": return openChar - case "<": return ">" - case "{": return "}" - default: fatalError("Not implemented") + case "'", "\"", "`", "^", "%", "#", "$": return openChar + case "<": return ">" + case "{": return "}" + default: + unreachable("Unhandled case") + return openChar } } @@ -1535,56 +1545,53 @@ extension Source { /// | 'k{' '}' /// | [1-9] [0-9]+ /// - private mutating func lexEscapedReference( - context: ParsingContext - ) throws -> Located? { - try recordLoc { src in - try src.tryEating { src in - guard let firstChar = src.peek() else { return nil } + private mutating func lexEscapedReference() -> Located? { + recordLoc { p in + p.tryEating { p in + guard let firstChar = p.peek() else { return nil } - if src.tryEat("g") { + if p.tryEat("g") { // PCRE-style backreferences. 
- if src.tryEat("{") { - let ref = try src.expectNamedOrNumberedReference(endingWith: "}") + if p.tryEat("{") { + let ref = p.expectNamedOrNumberedReference(endingWith: "}") return .backreference(ref) } // Oniguruma-style subpatterns. - if let openChar = src.tryEat(anyOf: "<", "'") { - let closing = String(Source.getClosingDelimiter(for: openChar)) - return .subpattern(try src.expectNamedOrNumberedReference( + if let openChar = p.tryEat(anyOf: "<", "'") { + let closing = String(p.getClosingDelimiter(for: openChar)) + return .subpattern(p.expectNamedOrNumberedReference( endingWith: closing, allowWholePatternRef: true)) } // PCRE allows \g followed by a bare numeric reference. - if let ref = try src.lexNumberedReference() { + if let ref = p.lexNumberedReference() { return .backreference(ref) } return nil } - if src.tryEat("k") { + if p.tryEat("k") { // Perl/.NET/Oniguruma-style backreferences. - if let openChar = src.tryEat(anyOf: "<", "'") { - let closing = String(Source.getClosingDelimiter(for: openChar)) + if let openChar = p.tryEat(anyOf: "<", "'") { + let closing = String(p.getClosingDelimiter(for: openChar)) // Perl only accept named references here, but Oniguruma and .NET // also accepts numbered references. This shouldn't be an ambiguity // as named references may not begin with a digit, '-', or '+'. // Oniguruma also allows a recursion level to be specified. - return .backreference(try src.expectNamedOrNumberedReference( + return .backreference(p.expectNamedOrNumberedReference( endingWith: closing, allowRecursionLevel: true)) } // Perl/.NET also allow a named references with the '{' delimiter. - if src.tryEat("{") { - return .backreference( - try src.expectNamedReference(endingWith: "}")) + if p.tryEat("{") { + return .backreference(p.expectNamedReference(endingWith: "}")) } return nil } // Backslash followed by a non-0 digit character is a backreference. 
- if firstChar != "0", let num = try src.lexNumber() { + if firstChar != "0", let num = p.lexNumber() { return .backreference(.init(.absolute(num), innerLoc: num.location)) } return nil @@ -1602,35 +1609,35 @@ extension Source { /// | NumberRef /// private mutating func lexGroupLikeReference( - ) throws -> Located? { - try recordLoc { src in - try src.tryEating { src in - guard src.tryEat(sequence: "(?") else { return nil } + ) -> Located? { + recordLoc { p in + p.tryEating { p in + guard p.tryEat(sequence: "(?") else { return nil } // Note the below should be covered by canLexGroupLikeReference. // Python-style references. - if src.tryEat(sequence: "P=") { - return .backreference(try src.expectNamedReference(endingWith: ")")) + if p.tryEat(sequence: "P=") { + return .backreference(p.expectNamedReference(endingWith: ")")) } - if src.tryEat(sequence: "P>") { - return .subpattern(try src.expectNamedReference(endingWith: ")")) + if p.tryEat(sequence: "P>") { + return .subpattern(p.expectNamedReference(endingWith: ")")) } // Perl-style subpatterns. - if src.tryEat("&") { - return .subpattern(try src.expectNamedReference(endingWith: ")")) + if p.tryEat("&") { + return .subpattern(p.expectNamedReference(endingWith: ")")) } // Whole-pattern recursion, which is equivalent to (?0). - if let loc = src.tryEatWithLoc("R") { - try src.expect(")") + if let loc = p.tryEatWithLoc("R") { + p.expect(")") return .subpattern(.init(.recurseWholePattern(loc), innerLoc: loc)) } // Numbered subpattern reference. - if let ref = try src.lexNumberedReference(allowWholePatternRef: true) { - try src.expect(")") + if let ref = p.lexNumberedReference(allowWholePatternRef: true) { + p.expect(")") return .subpattern(ref) } return nil @@ -1639,53 +1646,51 @@ extension Source { } /// Whether we can lex a group-like reference after the specifier '(?'. 
- private func canLexGroupLikeReference() -> Bool { - lookahead { src in - if src.tryEat("P") { - return src.tryEat(anyOf: "=", ">") != nil + private mutating func canLexGroupLikeReference() -> Bool { + lookahead { p in + if p.tryEat("P") { + return p.tryEat(anyOf: "=", ">") != nil } - if src.tryEat(anyOf: "&", "R") != nil { + if p.tryEat(anyOf: "&", "R") != nil { return true } - return src.canLexNumberedReference() + return p.canLexNumberedReference() } } - private func canLexMatchingOptionsAsAtom(context: ParsingContext) -> Bool { - lookahead { src in + private mutating func canLexMatchingOptionsAsAtom() -> Bool { + lookahead { p in // See if we can lex a matching option sequence that terminates in ')'. - // Such a sequence is an atom. If an error is thrown, there are invalid - // elements of the matching option sequence. In such a case, we can lex as - // a group and diagnose the invalid group kind. - guard (try? src.lexMatchingOptionSequence(context: context)) != nil else { + // Such a sequence is an atom. + guard p.lexMatchingOptionSequence() != nil else { return false } - return src.tryEat(")") + return p.tryEat(")") } } /// Whether a group specifier should be lexed as an atom instead of a group. - private func shouldLexGroupLikeAtom(context: ParsingContext) -> Bool { - lookahead { src in - guard src.tryEat("(") else { return false } + private mutating func shouldLexGroupLikeAtom() -> Bool { + lookahead { p in + guard p.tryEat("(") else { return false } - if src.tryEat("?") { + if p.tryEat("?") { // The start of a reference '(?P=', '(?R', ... - if src.canLexGroupLikeReference() { return true } + if p.canLexGroupLikeReference() { return true } // The start of a PCRE callout. - if src.tryEat("C") { return true } + if p.tryEat("C") { return true } // The start of an Oniguruma 'of-contents' callout. - if src.tryEat("{") { return true } + if p.tryEat("{") { return true } // A matching option atom (?x), (?i), ... 
- if src.canLexMatchingOptionsAsAtom(context: context) { return true } + if p.canLexMatchingOptionsAsAtom() { return true } return false } // The start of a backreference directive or Oniguruma named callout. - if src.tryEat("*") { return true } + if p.tryEat("*") { return true } return false } @@ -1697,47 +1702,50 @@ extension Source { /// | UniScalar | Property | NamedCharacter /// | EscapedReference /// - mutating func expectEscaped( - context: ParsingContext - ) throws -> Located { - try recordLoc { src in - let ccc = context.isInCustomCharacterClass + mutating func expectEscaped() -> Located { + recordLoc { p in + let ccc = p.context.isInCustomCharacterClass // Keyboard control/meta - if src.tryEat("c") || src.tryEat(sequence: "C-") { - return .keyboardControl(try src.expectASCII().value) + if p.tryEat("c") || p.tryEat(sequence: "C-") { + guard let ascii = p.expectASCII() else { return .invalid } + return .keyboardControl(ascii.value) } - if src.tryEat(sequence: "M-\\C-") { - return .keyboardMetaControl(try src.expectASCII().value) + if p.tryEat(sequence: "M-\\C-") { + guard let ascii = p.expectASCII() else { return .invalid } + return .keyboardMetaControl(ascii.value) } - if src.tryEat(sequence: "M-") { - return .keyboardMeta(try src.expectASCII().value) + if p.tryEat(sequence: "M-") { + guard let ascii = p.expectASCII() else { return .invalid } + return .keyboardMeta(ascii.value) } // Named character '\N{...}'. - if let char = try src.lexNamedCharacter() { + if let char = p.lexNamedCharacter() { return char.value } // Character property \p{...} \P{...}. - if let prop = try src.lexCharacterProperty() { + if let prop = p.lexCharacterProperty() { return .property(prop.value) } // References using escape syntax, e.g \1, \g{1}, \k<...>, ... // These are not valid inside custom character classes. 
- if !ccc, let ref = try src.lexEscapedReference(context: context)?.value { + if !ccc, let ref = p.lexEscapedReference()?.value { return ref } // Hexadecimal and octal unicode scalars. - if let scalar = try src.lexUnicodeScalar() { + if let scalar = p.lexUnicodeScalar() { return scalar } - guard let char = src.tryEat() else { - throw ParseError.expectedEscape + guard let charLoc = p.tryEatWithLoc() else { + p.errorAtCurrentPosition(.expectedEscape) + return .invalid } + let char = charLoc.value // Single-character builtins. if let builtin = AST.Atom.EscapedBuiltin( @@ -1749,10 +1757,9 @@ extension Source { // We only allow unknown escape sequences for non-letter non-number ASCII, // and non-ASCII whitespace. // TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`. - guard (char.isASCII && !char.isLetter && !char.isNumber) || - (!char.isASCII && char.isWhitespace) - else { - throw ParseError.invalidEscape(char) + if (char.isASCII && (char.isLetter || char.isNumber)) || + (!char.isASCII && !char.isWhitespace) { + p.error(.invalidEscape(char), at: charLoc.location) } return .char(char) } @@ -1771,33 +1778,34 @@ extension Source { /// | '$' '$' /// | '{' '}' /// - mutating func lexPCRECallout() throws -> AST.Atom.Callout? { + mutating func lexPCRECallout() -> AST.Atom.Callout? { guard tryEat(sequence: "(?C") else { return nil } - let arg = try recordLoc { src -> AST.Atom.Callout.PCRE.Argument in + let arg = recordLoc { p -> AST.Atom.Callout.PCRE.Argument in // Parse '(?C' followed by a number. - if let num = try src.lexNumber() { + if let num = p.lexNumber() { return .number(num) } // '(?C)' is implicitly '(?C0)'. - if src.peek() == ")" { - let pos = src.currentPosition - return .number(.init(0, at: SourceLocation(pos ..< pos))) + if p.peek() == ")" { + return .number(.init(0, at: p.loc(p.src.currentPosition))) } // Parse '(C?' 
followed by a set of balanced delimiters as defined by // http://pcre.org/current/doc/html/pcre2pattern.html#SEC28 - if let open = src.tryEat(anyOf: "`", "'", "\"", "^", "%", "#", "$", "{") { - let closing = String(Source.getClosingDelimiter(for: open)) - return .string(try src.expectQuoted(endingWith: closing).value) + if let open = p.tryEat(anyOf: "`", "'", "\"", "^", "%", "#", "$", "{") { + let closing = String(p.getClosingDelimiter(for: open)) + return .string(p.expectQuoted(endingWith: closing).value) } // If we don't know what this syntax is, consume up to the ending ')' and // emit an error. - let remaining = src.lexUntil { $0.isEmpty || $0.tryEat(")") }.value - if remaining.isEmpty { - throw ParseError.expected(")") + let remaining = p.lexUntil { $0.src.isEmpty || $0.peek() == ")" } + if p.src.isEmpty && remaining.value.isEmpty { + p.errorAtCurrentPosition(.expected(")")) + } else { + p.error(.unknownCalloutKind("(?C\(remaining.value))"), at: remaining.location) } - throw ParseError.unknownCalloutKind("(?C\(remaining))") + return .string(remaining.value) } - try expect(")") + expect(")") return .pcre(.init(arg)) } @@ -1808,22 +1816,24 @@ extension Source { /// mutating func expectOnigurumaCalloutArgList( leftBrace: SourceLocation - ) throws -> AST.Atom.Callout.OnigurumaNamed.ArgList { + ) -> AST.Atom.Callout.OnigurumaNamed.ArgList { var args: [Located] = [] while true { - let arg = try recordLoc { src -> String in + let arg = recordLoc { p -> String? in // TODO: Warn about whitespace being included? 
- guard let arg = src.tryEatPrefix({ $0 != "," && $0 != "}" }) else { - throw ParseError.expectedCalloutArgument + guard let arg = p.tryEatPrefix({ $0 != "," && $0 != "}" }) else { + p.errorAtCurrentPosition(.expectedCalloutArgument) + return nil } - return arg.string + return arg.value } - args.append(arg) - - if peek() == "}" { break } - try expect(",") + if let arg = arg { + args.append(arg) + } + if src.isEmpty || peek() == "}" { break } + expect(",") } - let rightBrace = try expect("}") + let rightBrace = expectWithLoc("}").location return .init(leftBrace, args, rightBrace) } @@ -1832,12 +1842,12 @@ extension Source { /// OnigurumaTag -> '[' Identifier ']' /// mutating func lexOnigurumaCalloutTag( - ) throws -> AST.Atom.Callout.OnigurumaTag? { + ) -> AST.Atom.Callout.OnigurumaTag? { guard let leftBracket = tryEatWithLoc("[") else { return nil } - let name = try expectIdentifier( + let name = expectIdentifier( .onigurumaCalloutTag, endingWith: "]", eatEnding: false ) - let rightBracket = try expect("]") + let rightBracket = expectWithLoc("]").location return .init(leftBracket, name, rightBracket) } @@ -1846,19 +1856,18 @@ extension Source { /// OnigurumaNamedCallout -> '(*' Identifier OnigurumaTag? Args? ')' /// Args -> '{' OnigurumaCalloutArgList '}' /// - mutating func lexOnigurumaNamedCallout() throws -> AST.Atom.Callout? { - try tryEating { src in - guard src.tryEat(sequence: "(*") else { return nil } - guard let name = src.lexIdentifier( + mutating func lexOnigurumaNamedCallout() -> AST.Atom.Callout? 
{ + tryEating { p in + guard p.tryEat(sequence: "(*") else { return nil } + let name = p.expectIdentifier( .onigurumaCalloutName, endingWith: ")", eatEnding: false) - else { return nil } - let tag = try src.lexOnigurumaCalloutTag() + let tag = p.lexOnigurumaCalloutTag() - let args = try src.tryEatWithLoc("{").map { - try src.expectOnigurumaCalloutArgList(leftBrace: $0) + let args = p.tryEatWithLoc("{").map { + p.expectOnigurumaCalloutArgList(leftBrace: $0) } - try src.expect(")") + p.expect(")") return .onigurumaNamed(.init(name, tag: tag, args: args)) } } @@ -1869,32 +1878,33 @@ extension Source { /// Contents -> /// Direction -> 'X' | '<' | '>' /// - mutating func lexOnigurumaCalloutOfContents() throws -> AST.Atom.Callout? { - try tryEating { src in - guard src.tryEat(sequence: "(?"), - let openBraces = src.tryEatPrefix({ $0 == "{" }) + mutating func lexOnigurumaCalloutOfContents() -> AST.Atom.Callout? { + tryEating { p in + guard p.tryEat(sequence: "(?"), + let openBraces = p.tryEatPrefix({ $0 == "{" }) else { return nil } - let contents = try src.expectQuoted( - endingWith: "}", count: openBraces.count) + let contents = p.expectQuoted( + endingWith: "}", count: openBraces.value.count) let closeBraces = SourceLocation( - contents.location.end ..< src.currentPosition) + contents.location.end ..< p.src.currentPosition) - let tag = try src.lexOnigurumaCalloutTag() + let tag = p.lexOnigurumaCalloutTag() typealias Direction = AST.Atom.Callout.OnigurumaOfContents.Direction - let direction = src.recordLoc { src -> Direction in - if src.tryEat(">") { return .inProgress } - if src.tryEat("<") { return .inRetraction } - if src.tryEat("X") { return .both } + let direction = p.recordLoc { p -> Direction in + if p.tryEat(">") { return .inProgress } + if p.tryEat("<") { return .inRetraction } + if p.tryEat("X") { return .both } // The default is in-progress. 
return .inProgress } - try src.expect(")") + p.expect(")") - let openBracesLoc = SourceLocation(from: openBraces) return .onigurumaOfContents(.init( - openBracesLoc, contents, closeBraces, tag: tag, direction: direction)) + openBraces.location, contents, closeBraces, tag: tag, + direction: direction + )) } } @@ -1905,94 +1915,93 @@ extension Source { /// | 'COMMIT' | 'PRUNE' | 'SKIP' | 'THEN' /// mutating func lexBacktrackingDirective( - ) throws -> AST.Atom.BacktrackingDirective? { - try tryEating { src in - guard src.tryEat(sequence: "(*") else { return nil } - let kind = src.recordLoc { src -> AST.Atom.BacktrackingDirective.Kind? in - if src.tryEat(sequence: "ACCEPT") { return .accept } - if src.tryEat(sequence: "FAIL") || src.tryEat("F") { return .fail } - if src.tryEat(sequence: "MARK") || src.peek() == ":" { return .mark } - if src.tryEat(sequence: "COMMIT") { return .commit } - if src.tryEat(sequence: "PRUNE") { return .prune } - if src.tryEat(sequence: "SKIP") { return .skip } - if src.tryEat(sequence: "THEN") { return .then } + ) -> AST.Atom.BacktrackingDirective? { + tryEating { p in + guard p.tryEat(sequence: "(*") else { return nil } + let kind = p.recordLoc { p -> AST.Atom.BacktrackingDirective.Kind? in + if p.tryEat(sequence: "ACCEPT") { return .accept } + if p.tryEat(sequence: "FAIL") || p.tryEat("F") { return .fail } + if p.tryEat(sequence: "MARK") || p.peek() == ":" { return .mark } + if p.tryEat(sequence: "COMMIT") { return .commit } + if p.tryEat(sequence: "PRUNE") { return .prune } + if p.tryEat(sequence: "SKIP") { return .skip } + if p.tryEat(sequence: "THEN") { return .then } return nil } guard let kind = kind else { return nil } var name: Located? - if src.tryEat(":") { + if p.tryEat(":") { // TODO: PCRE allows escaped delimiters or '\Q...\E' sequences in the // name under PCRE2_ALT_VERBNAMES. It also allows whitespace under (?x). 
- name = try src.expectQuoted(endingWith: ")", eatEnding: false) + name = p.expectQuoted(endingWith: ")", eatEnding: false) } - try src.expect(")") + p.expect(")") // MARK directives must be named. if name == nil && kind.value == .mark { - throw ParseError.backtrackingDirectiveMustHaveName( - String(src[kind.location.range])) + let kindStr = String(p.src[kind.location.range]) + p.error(.backtrackingDirectiveMustHaveName(kindStr), at: kind.location) } return .init(kind, name: name) } } - /// Consume a group-like atom, throwing an error if an atom could not be + /// Consume a group-like atom, diagnosing an error if an atom could not be /// produced. /// /// GroupLikeAtom -> GroupLikeReference | Callout | BacktrackingDirective /// - mutating func expectGroupLikeAtom( - context: ParsingContext - ) throws -> AST.Atom.Kind { - try recordLoc { src in - // References that look like groups, e.g (?R), (?1), ... - if let ref = try src.lexGroupLikeReference() { - return ref.value - } + mutating func expectGroupLikeAtom() -> AST.Atom.Kind { + // References that look like groups, e.g (?R), (?1), ... + if let ref = lexGroupLikeReference() { + return ref.value + } - // Change matching options atom (?i), (?x-i), ... - if let seq = try src.lexChangeMatchingOptionAtom(context: context) { - return .changeMatchingOptions(seq) - } + // Change matching options atom (?i), (?x-i), ... + if let seq = lexChangeMatchingOptionAtom() { + return .changeMatchingOptions(seq) + } - // (*ACCEPT), (*FAIL), (*MARK), ... - if let b = try src.lexBacktrackingDirective() { - return .backtrackingDirective(b) - } + // (*ACCEPT), (*FAIL), (*MARK), ... + if let b = lexBacktrackingDirective() { + return .backtrackingDirective(b) + } - // Global matching options can only appear at the very start. - if let opt = try src.lexGlobalMatchingOption() { - throw ParseError.globalMatchingOptionNotAtStart( - String(src[opt.location.range])) - } + // Global matching options can only appear at the very start. 
+ if let opt = lexGlobalMatchingOption() { + let optStr = String(src[opt.location.range]) + error(.globalMatchingOptionNotAtStart(optStr), at: opt.location) + return .invalid + } - // (?C) - if let callout = try src.lexPCRECallout() { - return .callout(callout) - } + // (?C) + if let callout = lexPCRECallout() { + return .callout(callout) + } - // Try to consume an Oniguruma named callout '(*name)', which should be - // done after backtracking directives and global options. - if let callout = try src.lexOnigurumaNamedCallout() { - return .callout(callout) - } + // Try to consume an Oniguruma named callout '(*name)', which should be + // done after backtracking directives and global options. + if let callout = lexOnigurumaNamedCallout() { + return .callout(callout) + } - // (?{...}) - if let callout = try src.lexOnigurumaCalloutOfContents() { - return .callout(callout) - } + // (?{...}) + if let callout = lexOnigurumaCalloutOfContents() { + return .callout(callout) + } - // If we didn't produce an atom, consume up until a reasonable end-point - // and throw an error. - try src.expect("(") - let remaining = src.lexUntil { - $0.isEmpty || $0.tryEat(anyOf: ":", ")") != nil - }.value - if remaining.isEmpty { - throw ParseError.expected(")") - } - throw ParseError.unknownGroupKind(remaining) - }.value + // If we didn't produce an atom, consume up until a reasonable end-point + // and diagnose an error. + expect("(") + let remaining = lexUntil { + $0.src.isEmpty || $0.tryEat(anyOf: ":", ")") != nil + } + if remaining.value.isEmpty { + error(.expected(")"), at: remaining.location) + } else { + error(.unknownGroupKind(remaining.value), at: remaining.location) + } + return .invalid } @@ -2007,43 +2016,49 @@ extension Source { /// /// ExpGroupStart -> '(_:' /// - mutating func lexAtom(context: ParsingContext) throws -> AST.Atom? { + mutating func lexAtom() -> AST.Atom? { let customCC = context.isInCustomCharacterClass - let kind: Located? 
= try recordLoc { src in + let kind = recordLoc { p -> AST.Atom.Kind? in // Check for not-an-atom, e.g. parser recursion termination - if src.isEmpty { return nil } - if !customCC && (src.peek() == ")" || src.peek() == "|") { return nil } + if p.src.isEmpty { return nil } + if !customCC && (p.peek() == ")" || p.peek() == "|") { return nil } // TODO: Store customCC in the atom, if that's useful // POSIX character property. Like \p{...} this is also allowed outside of // a custom character class. - if let prop = try src.lexPOSIXCharacterProperty()?.value { + if let prop = p.lexPOSIXCharacterProperty()?.value { return .property(prop) } // If we have group syntax that was skipped over in lexGroupStart, we - // need to handle it as an atom, or throw an error. - if !customCC && src.shouldLexGroupLikeAtom(context: context) { - return try src.expectGroupLikeAtom(context: context) + // need to handle it as an atom, or diagnose an error. + if !customCC && p.shouldLexGroupLikeAtom() { + return p.expectGroupLikeAtom() } // A quantifier here is invalid. - if !customCC, - let q = try src.recordLoc({ try $0.lexQuantifier(context: context) }) { - throw ParseError.quantifierRequiresOperand( - String(src[q.location.range])) + if !customCC, let q = p.recordLoc({ $0.lexQuantifier() }) { + let str = String(p.src[q.location.range]) + p.error(.quantifierRequiresOperand(str), at: q.location) + return .invalid } - let char = src.eat() + guard let charLoc = p.tryEatWithLoc() else { + // We check at the beginning of the function for `isEmpty`, so we should + // not be at the end of the input here. 
+ p.unreachable("Unexpected end of input") + return nil + } + let char = charLoc.value switch char { case ")", "|": if customCC { return .char(char) } - throw Unreachable("TODO: reason") + p.unreachable("Is as a termination condition") case "(" where !customCC: - throw Unreachable("Should have lexed a group or group-like atom") + p.unreachable("Should have lexed a group or group-like atom") // (sometimes) special metacharacters case ".": return customCC ? .char(".") : .any @@ -2051,7 +2066,7 @@ extension Source { case "$": return customCC ? .char("$") : .endOfLine // Escaped - case "\\": return try src.expectEscaped(context: context).value + case "\\": return p.expectEscaped().value case "]": assert(!customCC, "parser should have prevented this") @@ -2065,7 +2080,7 @@ extension Source { let scalars = char.unicodeScalars if scalars.count > 1 && scalars.first!.isASCII && char != "\r\n" && !char.isLetter && !char.isNumber { - throw ParseError.confusableCharacter(char) + p.error(.confusableCharacter(char), at: charLoc.location) } break } @@ -2087,7 +2102,7 @@ extension Source { /// NewlineSequenceKind -> 'BSR_ANYCRLF' | 'BSR_UNICODE' /// private mutating func lexNewlineSequenceMatchingOption( - ) throws -> AST.GlobalMatchingOption.NewlineSequenceMatching? { + ) -> AST.GlobalMatchingOption.NewlineSequenceMatching? { if tryEat(sequence: "BSR_ANYCRLF") { return .anyCarriageReturnOrLinefeed } if tryEat(sequence: "BSR_UNICODE") { return .anyUnicode } return nil @@ -2098,7 +2113,7 @@ extension Source { /// NewlineKind -> 'CRLF' | 'CR' | 'ANYCRLF' | 'ANY' | 'LF' | 'NUL' /// private mutating func lexNewlineMatchingOption( - ) throws -> AST.GlobalMatchingOption.NewlineMatching? { + ) -> AST.GlobalMatchingOption.NewlineMatching? { // The ordering here is important: CRLF needs to precede CR, and ANYCRLF // needs to precede ANY to ensure we don't short circuit on the wrong one. 
if tryEat(sequence: "CRLF") { return .carriageAndLinefeedOnly } @@ -2124,38 +2139,38 @@ extension Source { /// | 'LIMIT_MATCH' /// private mutating func lexGlobalMatchingOptionKind( - ) throws -> Located? { - try recordLoc { src in - if let opt = try src.lexNewlineSequenceMatchingOption() { + ) -> Located? { + recordLoc { p in + if let opt = p.lexNewlineSequenceMatchingOption() { return .newlineSequenceMatching(opt) } - if let opt = try src.lexNewlineMatchingOption() { + if let opt = p.lexNewlineMatchingOption() { return .newlineMatching(opt) } - if src.tryEat(sequence: "LIMIT_DEPTH") { - try src.expect("=") - return .limitDepth(try src.expectNumber()) + if p.tryEat(sequence: "LIMIT_DEPTH") { + p.expect("=") + return .limitDepth(p.expectNumber()) } - if src.tryEat(sequence: "LIMIT_HEAP") { - try src.expect("=") - return .limitHeap(try src.expectNumber()) + if p.tryEat(sequence: "LIMIT_HEAP") { + p.expect("=") + return .limitHeap(p.expectNumber()) } - if src.tryEat(sequence: "LIMIT_MATCH") { - try src.expect("=") - return .limitMatch(try src.expectNumber()) + if p.tryEat(sequence: "LIMIT_MATCH") { + p.expect("=") + return .limitMatch(p.expectNumber()) } // The ordering here is important: NOTEMPTY_ATSTART needs to precede // NOTEMPTY to ensure we don't short circuit on the wrong one. 
- if src.tryEat(sequence: "NOTEMPTY_ATSTART") { return .notEmptyAtStart } - if src.tryEat(sequence: "NOTEMPTY") { return .notEmpty } - - if src.tryEat(sequence: "NO_AUTO_POSSESS") { return .noAutoPossess } - if src.tryEat(sequence: "NO_DOTSTAR_ANCHOR") { return .noDotStarAnchor } - if src.tryEat(sequence: "NO_JIT") { return .noJIT } - if src.tryEat(sequence: "NO_START_OPT") { return .noStartOpt } - if src.tryEat(sequence: "UTF") { return .utfMode } - if src.tryEat(sequence: "UCP") { return .unicodeProperties } + if p.tryEat(sequence: "NOTEMPTY_ATSTART") { return .notEmptyAtStart } + if p.tryEat(sequence: "NOTEMPTY") { return .notEmpty } + + if p.tryEat(sequence: "NO_AUTO_POSSESS") { return .noAutoPossess } + if p.tryEat(sequence: "NO_DOTSTAR_ANCHOR") { return .noDotStarAnchor } + if p.tryEat(sequence: "NO_JIT") { return .noJIT } + if p.tryEat(sequence: "NO_START_OPT") { return .noStartOpt } + if p.tryEat(sequence: "UTF") { return .utfMode } + if p.tryEat(sequence: "UCP") { return .unicodeProperties } return nil } } @@ -2165,13 +2180,13 @@ extension Source { /// GlobalMatchingOption -> '(*' GlobalMatchingOptionKind ')' /// mutating func lexGlobalMatchingOption( - ) throws -> AST.GlobalMatchingOption? { - let kind = try recordLoc { src -> AST.GlobalMatchingOption.Kind? in - try src.tryEating { src in - guard src.tryEat(sequence: "(*"), - let kind = try src.lexGlobalMatchingOptionKind()?.value + ) -> AST.GlobalMatchingOption? { + let kind = recordLoc { p -> AST.GlobalMatchingOption.Kind? in + p.tryEating { p in + guard p.tryEat(sequence: "(*"), + let kind = p.lexGlobalMatchingOptionKind()?.value else { return nil } - try src.expect(")") + p.expect(")") return kind } } @@ -2184,9 +2199,9 @@ extension Source { /// GlobalMatchingOptionSequence -> GlobalMatchingOption+ /// mutating func lexGlobalMatchingOptionSequence( - ) throws -> AST.GlobalMatchingOptionSequence? { + ) -> AST.GlobalMatchingOptionSequence? 
{ var opts: [AST.GlobalMatchingOption] = [] - while let opt = try lexGlobalMatchingOption() { + while let opt = lexGlobalMatchingOption() { opts.append(opt) } return .init(opts) diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 05a618f59..0011390c7 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -107,12 +107,13 @@ struct ParsingContext { } } -private struct Parser { - var source: Source +struct Parser { + var src: Source var context: ParsingContext + var diags = Diagnostics() - init(_ source: Source, syntax: SyntaxOptions) { - self.source = source + init(_ src: Source, syntax: SyntaxOptions) { + self.src = src self.context = ParsingContext(syntax: syntax) } } @@ -128,10 +129,20 @@ extension ParsingContext { // Diagnostics extension Parser { - fileprivate func loc( + func loc( _ start: Source.Position ) -> SourceLocation { - SourceLocation(start ..< source.currentPosition) + SourceLocation(start ..< src.currentPosition) + } + + mutating func error(_ err: ParseError, at loc: SourceLocation) { + diags.error(err, at: loc) + } + mutating func errorAtCurrentPosition(_ err: ParseError) { + diags.error(err, at: loc(src.currentPosition)) + } + mutating func unreachable(_ err: String) { + diags.fatal(.unreachable(err), at: loc(src.currentPosition)) } } @@ -141,9 +152,9 @@ extension Parser { /// /// Regex -> GlobalMatchingOptionSequence? RegexNode /// - mutating func parse() throws -> AST { + mutating func parse() -> AST { // First parse any global matching options if present. - let opts = try source.lexGlobalMatchingOptionSequence() + let opts = lexGlobalMatchingOptionSequence() // If we have a newline mode global option, update the context accordingly. if let opts = opts { @@ -155,18 +166,19 @@ extension Parser { } // Then parse the root AST node. 
- let ast = try parseNode() - guard source.isEmpty else { + let ast = parseNode() + if !src.isEmpty { // parseConcatenation() terminates on encountering a ')' to enable // recursive parses of a group body. However for a top-level parse, this // means we have an unmatched closing paren, so let's diagnose. - if let loc = source.tryEatWithLoc(")") { - throw Source.LocatedError(ParseError.unbalancedEndOfGroup, loc) + // TODO: We should continue to parse for better recovery. + if let loc = tryEatWithLoc(")") { + error(.unbalancedEndOfGroup, at: loc) + } else { + unreachable("Unhandled termination condition") } - fatalError("Unhandled termination condition") } - // TODO: Record and store diagnostics on the AST. - return .init(ast, globalOptions: opts, diags: Diagnostics()) + return .init(ast, globalOptions: opts, diags: diags) } /// Parse a regular expression node. This should be used instead of `parse()` @@ -175,18 +187,18 @@ extension Parser { /// RegexNode -> '' | Alternation /// Alternation -> Concatenation ('|' Concatenation)* /// - mutating func parseNode() throws -> AST.Node { - let _start = source.currentPosition + mutating func parseNode() -> AST.Node { + let _start = src.currentPosition - if source.isEmpty { return .empty(.init(loc(_start))) } + if src.isEmpty { return .empty(.init(loc(_start))) } - var result = [try parseConcatenation()] + var result = [parseConcatenation()] var pipes: [SourceLocation] = [] while true { - let pipeStart = source.currentPosition - guard source.tryEat("|") else { break } + let pipeStart = src.currentPosition + guard tryEat("|") else { break } pipes.append(loc(pipeStart)) - result.append(try parseConcatenation()) + result.append(parseConcatenation()) } if result.count == 1 { @@ -202,40 +214,39 @@ extension Parser { /// ConcatComponent -> Trivia | Quote | Quantification /// Quantification -> QuantOperand Quantifier? 
/// - mutating func parseConcatenation() throws -> AST.Node { + mutating func parseConcatenation() -> AST.Node { var result = [AST.Node]() - let _start = source.currentPosition + let _start = src.currentPosition while true { // Check for termination, e.g. of recursion or bin ops - if source.isEmpty { break } - if source.peek() == "|" || source.peek() == ")" { break } + if src.isEmpty { break } + if peek() == "|" || peek() == ")" { break } // TODO: refactor loop body into function - let _start = source.currentPosition + let _start = src.currentPosition // Trivia -> `lexTrivia` - if let triv = try source.lexTrivia(context: context) { + if let triv = lexTrivia() { result.append(.trivia(triv)) continue } // Quote -> `lexQuote` - if let quote = try source.lexQuote(context: context) { + if let quote = lexQuote() { result.append(.quote(quote)) continue } // Interpolation -> `lexInterpolation` - if let interpolation = try source.lexInterpolation() { + if let interpolation = lexInterpolation() { result.append(.interpolation(interpolation)) continue } // Quantification -> QuantOperand Quantifier? - if let operand = try parseQuantifierOperand() { - if let (amt, kind, trivia) = - try source.lexQuantifier(context: context) { + if let operand = parseQuantifierOperand() { + if let (amt, kind, trivia) = lexQuantifier() { let location = loc(_start) result.append(.quantification( .init(amt, kind, operand, location, trivia: trivia))) @@ -245,7 +256,8 @@ extension Parser { continue } - throw Unreachable("TODO: reason") + unreachable("Should have parsed at least an atom") + break } guard !result.isEmpty else { return .empty(.init(loc(_start))) @@ -260,30 +272,30 @@ extension Parser { /// Perform a recursive parse for the branches of a conditional. 
mutating func parseConditionalBranches( start: Source.Position, _ cond: AST.Conditional.Condition - ) throws -> AST.Node { - let child = try parseNode() + ) -> AST.Node { + let child = parseNode() let trueBranch: AST.Node, falseBranch: AST.Node, pipe: SourceLocation? switch child { case .alternation(let a): + pipe = a.pipes[0] + trueBranch = a.children[0] + falseBranch = a.children[1] + // If we have an alternation child, we only accept 2 branches. let numBranches = a.children.count guard numBranches == 2 else { - // TODO: Better API for the parser to throw located errors. - throw Source.LocatedError( - ParseError.tooManyBranchesInConditional(numBranches), child.location - ) + diags.error(.tooManyBranchesInConditional(numBranches), + at: child.location) + break } - trueBranch = a.children[0] - falseBranch = a.children[1] - pipe = a.pipes[0] default: // If there's no alternation, the child is assumed to be the true // branch, with the false branch matching anything. trueBranch = child - falseBranch = .empty(.init(loc(source.currentPosition))) + falseBranch = .empty(.init(loc(src.currentPosition))) pipe = nil } - try source.expect(")") + expect(")") return .conditional(.init( cond, trueBranch: trueBranch, pipe: pipe, falseBranch: falseBranch, loc(start))) @@ -293,7 +305,7 @@ extension Parser { /// current set of options. private mutating func applySyntaxOptions( of opts: AST.MatchingOptionSequence, isScoped: Bool - ) throws { + ) { func mapOption(_ option: SyntaxOptions, _ pred: (AST.MatchingOption) -> Bool) { if opts.resetsCurrentOptions { @@ -326,12 +338,9 @@ extension Parser { // An unscoped removal of extended syntax is not allowed in a multi-line // literal. if let opt = opts.removing.first(where: \.isAnyExtended) { - throw Source.LocatedError( - ParseError.cannotRemoveExtendedSyntaxInMultilineMode, opt.location) - } - if opts.resetsCurrentOptions { - throw Source.LocatedError( - ParseError.cannotResetExtendedSyntaxInMultilineMode, opts.caretLoc!) 
+ error(.cannotRemoveExtendedSyntaxInMultilineMode, at: opt.location) + } else if opts.resetsCurrentOptions { + error(.cannotResetExtendedSyntaxInMultilineMode, at: opts.caretLoc!) } // The only remaning case is an unscoped addition of extended syntax, // which is a no-op. @@ -346,36 +355,35 @@ extension Parser { /// current set of options. private mutating func applySyntaxOptions( of group: AST.Group.Kind, isScoped: Bool - ) throws { + ) { if case .changeMatchingOptions(let seq) = group { - try applySyntaxOptions(of: seq, isScoped: isScoped) + applySyntaxOptions(of: seq, isScoped: isScoped) } } /// Perform a recursive parse for the body of a group. mutating func parseGroupBody( start: Source.Position, _ kind: AST.Located - ) throws -> AST.Group { + ) -> AST.Group { context.recordGroup(kind.value) let currentSyntax = context.syntax - try applySyntaxOptions(of: kind.value, isScoped: true) + applySyntaxOptions(of: kind.value, isScoped: true) defer { context.syntax = currentSyntax } let unsetsExtendedSyntax = currentSyntax.contains(.extendedSyntax) && !context.syntax.contains(.extendedSyntax) - let child = try parseNode() - try source.expect(")") + let child = parseNode() + expect(")") let groupLoc = loc(start) // In multi-line literals, the body of a group that unsets extended syntax // may not span multiple lines. 
if unsetsExtendedSyntax && context.syntax.contains(.multilineCompilerLiteral) && - source[child.location.range].spansMultipleLinesInRegexLiteral { - throw Source.LocatedError( - ParseError.unsetExtendedSyntaxMayNotSpanMultipleLines, groupLoc) + src[child.location.range].spansMultipleLinesInRegexLiteral { + error(.unsetExtendedSyntaxMayNotSpanMultipleLines, at: groupLoc) } return .init(kind, child, groupLoc) } @@ -389,7 +397,7 @@ extension Parser { /// mutating func parseAbsentFunctionBody( _ start: AST.Located - ) throws -> AST.AbsentFunction { + ) -> AST.AbsentFunction { let startLoc = start.location // TODO: Diagnose on nested absent functions, which Oniguruma states is @@ -398,31 +406,31 @@ extension Parser { switch start.value { case .withoutPipe: // Must be a repeater. - kind = .repeater(try parseNode()) - case .withPipe where source.peek() == ")": + kind = .repeater(parseNode()) + case .withPipe where peek() == ")": kind = .clearer case .withPipe: // Can either be an expression or stopper depending on whether we have a // any additional '|'s. - let child = try parseNode() + let child = parseNode() switch child { case .alternation(let alt): // A pipe, so an expression. + kind = .expression( + absentee: alt.children[0], pipe: alt.pipes[0], expr: alt.children[1]) + let numChildren = alt.children.count guard numChildren == 2 else { - throw Source.LocatedError( - ParseError.tooManyAbsentExpressionChildren(numChildren), - child.location - ) + error(.tooManyAbsentExpressionChildren(numChildren), + at: child.location) + break } - kind = .expression( - absentee: alt.children[0], pipe: alt.pipes[0], expr: alt.children[1]) default: // No pipes, so a stopper. kind = .stopper(child) } } - try source.expect(")") + expect(")") return .init(kind, start: startLoc, location: loc(startLoc.start)) } @@ -434,44 +442,43 @@ extension Parser { /// Conditional -> CondStart Concatenation ('|' Concatenation)? 
')' /// CondStart -> KnownCondStart | GroupCondStart /// - mutating func parseQuantifierOperand() throws -> AST.Node? { - assert(!source.isEmpty) + mutating func parseQuantifierOperand() -> AST.Node? { + assert(!src.isEmpty) - let _start = source.currentPosition + let _start = src.currentPosition // Check if we have the start of a conditional '(?(cond)', which can either // be a known condition, or an arbitrary group condition. - if let cond = try source.lexKnownConditionalStart(context: context) { - return try parseConditionalBranches(start: _start, cond) + if let cond = lexKnownConditionalStart() { + return parseConditionalBranches(start: _start, cond) } - if let kind = try source.lexGroupConditionalStart(context: context) { + if let kind = lexGroupConditionalStart() { let groupStart = kind.location.start - let group = try parseGroupBody(start: groupStart, kind) - return try parseConditionalBranches( + let group = parseGroupBody(start: groupStart, kind) + return parseConditionalBranches( start: _start, .init(.group(group), group.location)) } // Check if we have an Oniguruma absent function. - if let start = source.lexAbsentFunctionStart() { - return .absentFunction(try parseAbsentFunctionBody(start)) + if let start = lexAbsentFunctionStart() { + return .absentFunction(parseAbsentFunctionBody(start)) } // Check if we have the start of a group '('. - if let kind = try source.lexGroupStart(context: context) { - return .group(try parseGroupBody(start: _start, kind)) + if let kind = lexGroupStart() { + return .group(parseGroupBody(start: _start, kind)) } // Check if we have the start of a custom character class '['. 
- if let cccStart = source.lexCustomCCStart() { - return .customCharacterClass( - try parseCustomCharacterClass(cccStart)) + if let cccStart = lexCustomCCStart() { + return .customCharacterClass(parseCustomCharacterClass(cccStart)) } - if let atom = try source.lexAtom(context: context) { + if let atom = lexAtom() { // If we have a change matching options atom, apply the syntax options. We // already take care of scoping syntax options within a group. if case .changeMatchingOptions(let opts) = atom.kind { - try applySyntaxOptions(of: opts, isScoped: false) + applySyntaxOptions(of: opts, isScoped: false) } // TODO: track source locations return .atom(atom) @@ -496,19 +503,18 @@ extension Parser { /// mutating func parseCustomCharacterClass( _ start: Source.Located - ) throws -> CustomCC { + ) -> CustomCC { let alreadyInCCC = context.isInCustomCharacterClass context.isInCustomCharacterClass = true defer { context.isInCustomCharacterClass = alreadyInCCC } typealias Member = CustomCC.Member var members: Array = [] - try parseCCCMembers(into: &members) + parseCCCMembers(into: &members) // Make sure we have at least one semantic member. if members.none(\.isSemantic) { - throw Source.LocatedError( - ParseError.expectedCustomCharacterClassMembers, start.location) + error(.expectedCustomCharacterClassMembers, at: start.location) } // If we have a binary set operator, parse it and the next members. Note @@ -516,40 +522,39 @@ extension Parser { // TODO: We may want to diagnose and require users to disambiguate, at least // for chains of separate operators. // TODO: What about precedence? 
- while let binOp = try source.lexCustomCCBinOp() { + while let binOp = lexCustomCCBinOp() { var rhs: Array = [] - try parseCCCMembers(into: &rhs) + parseCCCMembers(into: &rhs) if rhs.none(\.isSemantic) { - throw Source.LocatedError( - ParseError.expectedCustomCharacterClassMembers, start.location) + error(.expectedCustomCharacterClassMembers, at: start.location) } members = [.setOperation(members, binOp, rhs)] } - try source.expect("]") + expect("]") return CustomCC(start, members, loc(start.location.start)) } - mutating func parseCCCMember() throws -> CustomCC.Member? { - guard !source.isEmpty && source.peek() != "]" && source.peekCCBinOp() == nil + mutating func parseCCCMember() -> CustomCC.Member? { + guard !src.isEmpty && peek() != "]" && peekCCBinOp() == nil else { return nil } // Nested custom character class. - if let cccStart = source.lexCustomCCStart() { - return .custom(try parseCustomCharacterClass(cccStart)) + if let cccStart = lexCustomCCStart() { + return .custom(parseCustomCharacterClass(cccStart)) } // Quoted sequence. - if let quote = try source.lexQuote(context: context) { + if let quote = lexQuote() { return .quote(quote) } // Lex triva if we're allowed. - if let trivia = try source.lexTrivia(context: context) { + if let trivia = lexTrivia() { return .trivia(trivia) } - if let atom = try source.lexAtom(context: context) { + if let atom = lexAtom() { return .atom(atom) } return nil @@ -557,9 +562,7 @@ extension Parser { /// Attempt to parse a custom character class range into `members`, or regular /// members if a range cannot be formed. - mutating func parsePotentialCCRange( - into members: inout [CustomCC.Member] - ) throws { + mutating func parsePotentialCCRange(into members: inout [CustomCC.Member]) { guard let lhs = members.last, lhs.isSemantic else { return } // Try and see if we can parse a character class range. Each time we parse @@ -567,23 +570,21 @@ extension Parser { // being a range, and we bail. 
If we succeed in parsing, we remove the // intermediate members. let membersBeforeRange = members.count - 1 - while let t = try source.lexTrivia(context: context) { + while let t = lexTrivia() { members.append(.trivia(t)) } - guard let dash = source.lexCustomCharacterClassRangeOperator() else { - return - } + guard let dash = lexCustomCharacterClassRangeOperator() else { return } // If we can't parse a range, '-' becomes literal, e.g `[6-]`. members.append(.atom(.init(.char("-"), dash))) - while let t = try source.lexTrivia(context: context) { + while let t = lexTrivia() { members.append(.trivia(t)) } - guard let rhs = try parseCCCMember() else { return } + guard let rhs = parseCCCMember() else { return } members.append(rhs) - func makeOperand(_ m: CustomCC.Member, isLHS: Bool) throws -> AST.Atom { + func makeOperand(_ m: CustomCC.Member, isLHS: Bool) -> AST.Atom? { switch m { case .atom(let a): return a @@ -591,25 +592,23 @@ extension Parser { // Not supported. While .NET allows `x-[...]` to spell subtraction, we // require `x--[...]`. We also ban `[...]-x` for consistency. if isLHS { - throw Source.LocatedError( - ParseError.invalidCharacterClassRangeOperand, m.location) + error(.invalidCharacterClassRangeOperand, at: m.location) } else { - throw Source.LocatedError( - ParseError.unsupportedDotNetSubtraction, m.location) + error(.unsupportedDotNetSubtraction, at: m.location) } case .quote: // Currently unsupported, we need to figure out what the semantics // would be for grapheme/scalar modes. 
- throw Source.LocatedError( - ParseError.unsupported("range with quoted sequence"), m.location) + error(.unsupported("range with quoted sequence"), at: m.location) case .trivia: - throw Unreachable("Should have been lexed separately") + unreachable("Should have been lexed separately") case .range, .setOperation: - throw Unreachable("Parsed later") + unreachable("Parsed later") } + return nil } - let lhsOp = try makeOperand(lhs, isLHS: true) - let rhsOp = try makeOperand(rhs, isLHS: false) + guard let lhsOp = makeOperand(lhs, isLHS: true), + let rhsOp = makeOperand(rhs, isLHS: false) else { return } // We've successfully parsed an atom LHS and RHS, so form a range, // collecting the trivia we've parsed, and replacing the members that @@ -622,20 +621,17 @@ extension Parser { // We need to specially check if we can lex a .NET character class // subtraction here as e.g `[a-c-[...]]` is allowed in .NET. Otherwise we'd // treat the second `-` as literal. - if let dashLoc = source.canLexDotNetCharClassSubtraction(context: context) { - throw Source.LocatedError( - ParseError.unsupportedDotNetSubtraction, dashLoc) + if let dashLoc = canLexDotNetCharClassSubtraction() { + error(.unsupportedDotNetSubtraction, at: dashLoc) } } - mutating func parseCCCMembers( - into members: inout Array - ) throws { + mutating func parseCCCMembers(into members: inout Array) { // Parse members and ranges until we see the end of the custom char class // or an operator. 
- while let member = try parseCCCMember() { + while let member = parseCCCMember() { members.append(member) - try parsePotentialCCRange(into: &members) + parsePotentialCCRange(into: &members) } } } @@ -651,20 +647,26 @@ public enum ASTStage { case semantic } -public func parse( - _ regex: S, _ stage: ASTStage, _ syntax: SyntaxOptions -) throws -> AST where S.SubSequence == Substring +public func parseWithRecovery( + _ regex: S, _ syntax: SyntaxOptions, stage: ASTStage = .semantic +) -> AST where S.SubSequence == Substring { let source = Source(String(regex)) var parser = Parser(source, syntax: syntax) - let ast = try parser.parse() + let ast = parser.parse() switch stage { case .syntactic: - break + return ast case .semantic: - try validate(ast) + return validate(ast) } - return ast +} + +public func parse( + _ regex: S, _ stage: ASTStage, _ syntax: SyntaxOptions +) throws -> AST where S.SubSequence == Substring +{ + try parseWithRecovery(regex, syntax, stage: stage).ensureValid() } extension StringProtocol { @@ -694,15 +696,25 @@ fileprivate func defaultSyntaxOptions( } } +/// Parses a given regex string with delimiters, inferring the syntax options +/// from the delimiters used. +public func parseWithDelimitersWithRecovery( + _ regex: S +) -> AST where S.SubSequence == Substring { + let (contents, delim) = droppingRegexDelimiters(String(regex)) + let syntax = defaultSyntaxOptions(delim, contents: contents) + return parseWithRecovery(contents, syntax) +} + /// Parses a given regex string with delimiters, inferring the syntax options /// from the delimiters used. 
public func parseWithDelimiters( _ regex: S, _ stage: ASTStage ) throws -> AST where S.SubSequence == Substring { let (contents, delim) = droppingRegexDelimiters(String(regex)) + let syntax = defaultSyntaxOptions(delim, contents: contents) do { - let syntax = defaultSyntaxOptions(delim, contents: contents) - return try parse(contents, stage, syntax) + return try parseWithRecovery(contents, syntax, stage: stage).ensureValid() } catch let error as LocatedErrorProtocol { // Convert the range in 'contents' to the range in 'regex'. let delimCount = delim.opening.count diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 269f0ee01..1b9da3e50 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -14,14 +14,18 @@ fileprivate struct RegexValidator { let ast: AST let captures: CaptureList + var diags = Diagnostics() init(_ ast: AST) { self.ast = ast self.captures = ast.captureList } - func error(_ kind: ParseError, at loc: SourceLocation) -> Error { - Source.LocatedError(kind, loc) + mutating func error(_ kind: ParseError, at loc: SourceLocation) { + diags.error(kind, at: loc) + } + mutating func unreachable(_ str: String, at loc: SourceLocation) { + diags.fatal(.unreachable(str), at: loc) } } @@ -30,89 +34,112 @@ extension String { } extension RegexValidator { - func validate() throws { + mutating func validate() -> AST { for opt in ast.globalOptions?.options ?? [] { - try validateGlobalMatchingOption(opt) + validateGlobalMatchingOption(opt) } - try validateCaptures() - try validateNode(ast.root) + validateCaptures() + validateNode(ast.root) + + var result = ast + result.diags.append(contentsOf: diags) + return result } - func validateGlobalMatchingOption(_ opt: AST.GlobalMatchingOption) throws { + /// Called when some piece of invalid AST is encountered. We want to ensure + /// an error was emitted. 
+ mutating func expectInvalid(at loc: SourceLocation) { + guard ast.diags.hasAnyError else { + unreachable("Invalid, but no error emitted?", at: loc) + return + } + } + + mutating func validateGlobalMatchingOption(_ opt: AST.GlobalMatchingOption) { switch opt.kind { case .limitDepth, .limitHeap, .limitMatch, .notEmpty, .notEmptyAtStart, .noAutoPossess, .noDotStarAnchor, .noJIT, .noStartOpt, .utfMode, .unicodeProperties: // These are PCRE specific, and not something we're likely to ever // support. - throw error(.unsupported("global matching option"), at: opt.location) + error(.unsupported("global matching option"), at: opt.location) case .newlineMatching: // We have implemented the correct behavior for multi-line literals, but // these should also affect '.' and '\N' matching, which we haven't // implemented. - throw error(.unsupported("newline matching mode"), at: opt.location) + error(.unsupported("newline matching mode"), at: opt.location) case .newlineSequenceMatching: // We haven't yet implemented the '\R' matching specifics of these. - throw error( - .unsupported("newline sequence matching mode"), at: opt.location) + error(.unsupported("newline sequence matching mode"), at: opt.location) } } - func validateCaptures() throws { + mutating func validateCaptures() { // TODO: Should this be validated when creating the capture list? 
var usedNames = Set() for capture in captures.captures { guard let name = capture.name else { continue } - guard usedNames.insert(name).inserted else { - throw error(.duplicateNamedCapture(name), at: capture.location) + if !usedNames.insert(name).inserted { + error(.duplicateNamedCapture(name), at: capture.location) } } } - func validateReference(_ ref: AST.Reference) throws { + mutating func validateReference(_ ref: AST.Reference) { if let recLevel = ref.recursionLevel { - throw error(.unsupported("recursion level"), at: recLevel.location) + error(.unsupported("recursion level"), at: recLevel.location) } switch ref.kind { case .absolute(let num): - guard let i = num.value else { break } - guard i < captures.captures.count else { - throw error(.invalidReference(i), at: ref.innerLoc) + guard let i = num.value else { + // Should have already been diagnosed. + expectInvalid(at: ref.innerLoc) + break + } + if i >= captures.captures.count { + error(.invalidReference(i), at: ref.innerLoc) } case .named(let name): - guard captures.hasCapture(named: name) else { - throw error(.invalidNamedReference(name), at: ref.innerLoc) + // An empty name is already invalid, so don't bother validating. + guard !name.isEmpty else { break } + if !captures.hasCapture(named: name) { + error(.invalidNamedReference(name), at: ref.innerLoc) + } + case .relative(let num): + guard let _ = num.value else { + // Should have already been diagnosed. + expectInvalid(at: ref.innerLoc) + break } - case .relative: - throw error(.unsupported("relative capture reference"), at: ref.innerLoc) + error(.unsupported("relative capture reference"), at: ref.innerLoc) } } - func validateMatchingOption(_ opt: AST.MatchingOption) throws { + mutating func validateMatchingOption(_ opt: AST.MatchingOption) { let loc = opt.location switch opt.kind { case .allowDuplicateGroupNames: // Not currently supported as we need to figure out what to do with // the capture type. 
- throw error(.unsupported("duplicate group naming"), at: loc) + error(.unsupported("duplicate group naming"), at: loc) case .unicodeWordBoundaries: - throw error(.unsupported("unicode word boundary mode"), at: loc) + error(.unsupported("unicode word boundary mode"), at: loc) case .textSegmentWordMode, .textSegmentGraphemeMode: - throw error(.unsupported("text segment mode"), at: loc) + error(.unsupported("text segment mode"), at: loc) case .byteSemantics: - throw error(.unsupported("byte semantic mode"), at: loc) + error(.unsupported("byte semantic mode"), at: loc) case .unicodeScalarSemantics: - throw error(.unsupported("unicode scalar semantic mode"), at: loc) - + error(.unsupported("unicode scalar semantic mode"), at: loc) + case .graphemeClusterSemantics: - throw error(.unsupported("grapheme semantic mode"), at: loc) - + error(.unsupported("grapheme semantic mode"), at: loc) + case .caseInsensitive, .possessiveByDefault, .reluctantByDefault, .singleLine, .multiline, .namedCapturesOnly, .extended, .extraExtended, .asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps: @@ -120,18 +147,18 @@ extension RegexValidator { } } - func validateMatchingOptions(_ opts: AST.MatchingOptionSequence) throws { + mutating func validateMatchingOptions(_ opts: AST.MatchingOptionSequence) { for opt in opts.adding { - try validateMatchingOption(opt) + validateMatchingOption(opt) } for opt in opts.removing { - try validateMatchingOption(opt) + validateMatchingOption(opt) } } - func validateBinaryProperty( + mutating func validateBinaryProperty( _ prop: Unicode.BinaryProperty, at loc: SourceLocation - ) throws { + ) { switch prop { case .asciiHexDigit, .alphabetic, .bidiControl, .bidiMirrored, .cased, .caseIgnorable, .changesWhenCasefolded, .changesWhenCasemapped, @@ -154,46 +181,49 @@ extension RegexValidator { break case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD, .expandsOnNFKC: - throw error(.deprecatedUnicode(prop.rawValue.quoted), at: loc) + 
error(.deprecatedUnicode(prop.rawValue.quoted), at: loc) case .compositionExclusion, .emojiComponent, .extendedPictographic, .graphemeLink, .hyphen, .otherAlphabetic, .otherDefaultIgnorableCodePoint, .otherGraphemeExtended, .otherIDContinue, .otherIDStart, .otherLowercase, .otherMath, .otherUppercase, .prependedConcatenationMark: - throw error(.unsupported(prop.rawValue.quoted), at: loc) + error(.unsupported(prop.rawValue.quoted), at: loc) } } - func validateCharacterProperty( + mutating func validateCharacterProperty( _ prop: AST.Atom.CharacterProperty, at loc: SourceLocation - ) throws { + ) { // TODO: We could re-add the .other case to diagnose unknown properties // here instead of in the parser. // TODO: Should we store an 'inner location' for the contents of `\p{...}`? switch prop.kind { case .binary(let b, _): - try validateBinaryProperty(b, at: loc) + validateBinaryProperty(b, at: loc) case .any, .assigned, .ascii, .generalCategory, .posix, .named, .script, .scriptExtension, .age, .numericType, .numericValue, .mapping, .ccc: break + case .invalid: + // Should have already been diagnosed. + expectInvalid(at: loc) case .pcreSpecial: - throw error(.unsupported("PCRE property"), at: loc) + error(.unsupported("PCRE property"), at: loc) case .block: - throw error(.unsupported("Unicode block property"), at: loc) + error(.unsupported("Unicode block property"), at: loc) case .javaSpecial: - throw error(.unsupported("Java property"), at: loc) + error(.unsupported("Java property"), at: loc) } } - func validateEscaped( + mutating func validateEscaped( _ esc: AST.Atom.EscapedBuiltin, at loc: SourceLocation - ) throws { + ) { switch esc { case .resetStartOfMatch, .singleDataUnit, // '\N' needs to be emitted using 'emitAny'. .notNewline: - throw error(.unsupported("'\\\(esc.character)'"), at: loc) + error(.unsupported("'\\\(esc.character)'"), at: loc) // Character classes. 
case .decimalDigit, .notDecimalDigit, .whitespace, .notWhitespace, @@ -218,34 +248,34 @@ extension RegexValidator { } } - func validateAtom(_ atom: AST.Atom, inCustomCharacterClass: Bool) throws { + mutating func validateAtom(_ atom: AST.Atom, inCustomCharacterClass: Bool) { switch atom.kind { case .escaped(let esc): - try validateEscaped(esc, at: atom.location) + validateEscaped(esc, at: atom.location) case .keyboardControl, .keyboardMeta, .keyboardMetaControl: // We need to implement the scalar computations for these. - throw error(.unsupported("control sequence"), at: atom.location) + error(.unsupported("control sequence"), at: atom.location) case .property(let p): - try validateCharacterProperty(p, at: atom.location) + validateCharacterProperty(p, at: atom.location) case .backreference(let r): - try validateReference(r) + validateReference(r) case .subpattern: - throw error(.unsupported("subpattern"), at: atom.location) + error(.unsupported("subpattern"), at: atom.location) case .callout: // These are PCRE and Oniguruma specific, supporting them is future work. - throw error(.unsupported("callout"), at: atom.location) + error(.unsupported("callout"), at: atom.location) case .backtrackingDirective: // These are PCRE-specific, and are unlikely to be fully supported. - throw error(.unsupported("backtracking directive"), at: atom.location) + error(.unsupported("backtracking directive"), at: atom.location) case .changeMatchingOptions(let opts): - try validateMatchingOptions(opts) + validateMatchingOptions(opts) case .namedCharacter: // TODO: We should error on unknown Unicode scalar names. @@ -254,77 +284,89 @@ extension RegexValidator { case .scalarSequence: // Not currently supported in a custom character class. 
if inCustomCharacterClass { - throw error(.unsupported("scalar sequence in custom character class"), - at: atom.location) + error(.unsupported("scalar sequence in custom character class"), + at: atom.location) } case .char, .scalar, .startOfLine, .endOfLine, .any: break + + case .invalid: + // Should have already been diagnosed. + expectInvalid(at: atom.location) + break } } - func validateCustomCharacterClass(_ c: AST.CustomCharacterClass) throws { + mutating func validateCustomCharacterClass(_ c: AST.CustomCharacterClass) { for member in c.members { - try validateCharacterClassMember(member) + validateCharacterClassMember(member) } } - func validateCharacterClassRange( + mutating func validateCharacterClassRange( _ range: AST.CustomCharacterClass.Range - ) throws { + ) { let lhs = range.lhs let rhs = range.rhs - try validateAtom(lhs, inCustomCharacterClass: true) - try validateAtom(rhs, inCustomCharacterClass: true) + validateAtom(lhs, inCustomCharacterClass: true) + validateAtom(rhs, inCustomCharacterClass: true) guard lhs.isValidCharacterClassRangeBound else { - throw error(.invalidCharacterClassRangeOperand, at: lhs.location) + error(.invalidCharacterClassRangeOperand, at: lhs.location) + return } guard rhs.isValidCharacterClassRangeBound else { - throw error(.invalidCharacterClassRangeOperand, at: rhs.location) + error(.invalidCharacterClassRangeOperand, at: rhs.location) + return } guard let lhsChar = lhs.literalCharacterValue else { - throw error( + error( .unsupported("character class range operand"), at: lhs.location) + return } guard let rhsChar = rhs.literalCharacterValue else { - throw error( + error( .unsupported("character class range operand"), at: rhs.location) + return } - guard lhsChar <= rhsChar else { - throw error( + if lhsChar > rhsChar { + error( .invalidCharacterRange(from: lhsChar, to: rhsChar), at: range.dashLoc) } } - func validateCharacterClassMember( + mutating func validateCharacterClassMember( _ member: 
AST.CustomCharacterClass.Member - ) throws { + ) { switch member { case .custom(let c): - try validateCustomCharacterClass(c) + validateCustomCharacterClass(c) case .range(let r): - try validateCharacterClassRange(r) + validateCharacterClassRange(r) case .atom(let a): - try validateAtom(a, inCustomCharacterClass: true) + validateAtom(a, inCustomCharacterClass: true) case .setOperation(let lhs, _, let rhs): - for lh in lhs { try validateCharacterClassMember(lh) } - for rh in rhs { try validateCharacterClassMember(rh) } + for lh in lhs { validateCharacterClassMember(lh) } + for rh in rhs { validateCharacterClassMember(rh) } case .quote, .trivia: break } } - func validateGroup(_ group: AST.Group) throws { + mutating func validateGroup(_ group: AST.Group) { let kind = group.kind + if let name = kind.value.name, name.isEmpty { + expectInvalid(at: kind.location) + } switch kind.value { case .capture, .namedCapture, .nonCapture, .lookahead, .negativeLookahead, .atomicNonCapturing: @@ -332,79 +374,83 @@ extension RegexValidator { case .balancedCapture: // These are .NET specific, and kinda niche. - throw error(.unsupported("balanced capture"), at: kind.location) + error(.unsupported("balanced capture"), at: kind.location) case .nonCaptureReset: // We need to figure out how these interact with typed captures. 
- throw error(.unsupported("branch reset group"), at: kind.location) + error(.unsupported("branch reset group"), at: kind.location) case .nonAtomicLookahead: - throw error(.unsupported("non-atomic lookahead"), at: kind.location) + error(.unsupported("non-atomic lookahead"), at: kind.location) case .lookbehind, .negativeLookbehind, .nonAtomicLookbehind: - throw error(.unsupported("lookbehind"), at: kind.location) + error(.unsupported("lookbehind"), at: kind.location) case .scriptRun, .atomicScriptRun: - throw error(.unsupported("script run"), at: kind.location) + error(.unsupported("script run"), at: kind.location) case .changeMatchingOptions(let opts): - try validateMatchingOptions(opts) + validateMatchingOptions(opts) } - try validateNode(group.child) + validateNode(group.child) } - func validateQuantification(_ quant: AST.Quantification) throws { - try validateNode(quant.child) - guard quant.child.isQuantifiable else { - throw error(.notQuantifiable, at: quant.child.location) + mutating func validateQuantification(_ quant: AST.Quantification) { + validateNode(quant.child) + if !quant.child.isQuantifiable { + error(.notQuantifiable, at: quant.child.location) } switch quant.amount.value { case .range(let lhs, let rhs): - guard let lhs = lhs.value, let rhs = rhs.value else { break } - guard lhs <= rhs else { - throw error(.invalidQuantifierRange(lhs, rhs), at: quant.location) + guard let lhs = lhs.value, let rhs = rhs.value else { + // Should have already been diagnosed. 
+ expectInvalid(at: quant.location) + break + } + if lhs > rhs { + error(.invalidQuantifierRange(lhs, rhs), at: quant.location) } case .zeroOrMore, .oneOrMore, .zeroOrOne, .exactly, .nOrMore, .upToN: break } } - func validateNode(_ node: AST.Node) throws { + mutating func validateNode(_ node: AST.Node) { switch node { case .alternation(let a): for branch in a.children { - try validateNode(branch) + validateNode(branch) } case .concatenation(let c): for child in c.children { - try validateNode(child) + validateNode(child) } case .group(let g): - try validateGroup(g) + validateGroup(g) case .conditional(let c): // Note even once we get runtime support for this, we need to change the // parsing to incorporate what is specified in the syntax proposal. - throw error(.unsupported("conditional"), at: c.location) + error(.unsupported("conditional"), at: c.location) case .quantification(let q): - try validateQuantification(q) + validateQuantification(q) case .atom(let a): - try validateAtom(a, inCustomCharacterClass: false) + validateAtom(a, inCustomCharacterClass: false) case .customCharacterClass(let c): - try validateCustomCharacterClass(c) + validateCustomCharacterClass(c) case .absentFunction(let a): // These are Oniguruma specific. - throw error(.unsupported("absent function"), at: a.location) + error(.unsupported("absent function"), at: a.location) case .interpolation(let i): // This is currently rejected in the parser for better diagnostics, but // reject here too until we get runtime support. - throw error(.unsupported("interpolation"), at: i.location) + error(.unsupported("interpolation"), at: i.location) case .quote, .trivia, .empty: break @@ -413,6 +459,7 @@ extension RegexValidator { } /// Check a regex AST for semantic validity. 
-public func validate(_ ast: AST) throws { - try RegexValidator(ast).validate() +public func validate(_ ast: AST) -> AST { + var validator = RegexValidator(ast) + return validator.validate() } diff --git a/Sources/_RegexParser/Regex/Parse/Source.swift b/Sources/_RegexParser/Regex/Parse/Source.swift index 23cc0497d..22715ebc3 100644 --- a/Sources/_RegexParser/Regex/Parse/Source.swift +++ b/Sources/_RegexParser/Regex/Parse/Source.swift @@ -52,80 +52,20 @@ extension Source { func peek() -> Char? { _slice.first } - mutating func advance() { - assert(!isEmpty) - let newLower = _slice.index(after: bounds.lowerBound) - self.bounds = newLower ..< bounds.upperBound - } - - mutating func advance(_ i: Int) { - for _ in 0.. Bool { + guard n > 0, let newLower = _slice.index( + bounds.lowerBound, offsetBy: n, limitedBy: bounds.upperBound) + else { + return false } - } - - mutating func tryEat(_ c: Char) -> Bool { - guard peek() == c else { return false } - advance() - return true - } - - mutating func tryEat(where pred: (Char) throws -> Bool) rethrows -> Bool { - guard let next = peek(), try pred(next) else { return false } - advance() - return true - } - - mutating func tryEat(sequence c: C) -> Bool - where C.Element == Char { - guard _slice.starts(with: c) else { return false } - advance(c.count) + self.bounds = newLower ..< bounds.upperBound return true } - mutating func tryEat(anyOf set: C) -> Char? - where C.Element == Char - { - guard let c = peek(), set.contains(c) else { return nil } - advance() - return c - } - mutating func tryEat(anyOf set: Char...) -> Char? { - tryEat(anyOf: set) - } - - /// Try to eat any character, returning `nil` if the input has been exhausted. - mutating func tryEat() -> Char? 
{ - guard !isEmpty else { return nil } - return eat() - } - - mutating func eat(asserting c: Char) { - assert(peek() == c) - advance() - } - - mutating func eat() -> Char { - assert(!isEmpty) - defer { advance() } - return peek().unsafelyUnwrapped - } - - func starts( - with s: S - ) -> Bool where S.Element == Char { - _slice.starts(with: s) - } - - mutating func eat(upTo: Position) -> Input.SubSequence { - defer { - while _slice.startIndex != upTo { advance() } - } - return _slice[.. Input.SubSequence { let pre = _slice.prefix(count) - defer { advance(pre.count) } + tryAdvance(pre.count) return pre } @@ -134,10 +74,20 @@ extension Source { _ f: (Char) -> Bool ) -> Input.SubSequence? { guard let pre = peekPrefix(maxLength: maxLength, f) else { return nil } - defer { self.advance(pre.count) } + tryAdvance(pre.count) return pre } + mutating func tryEat(count: Int) -> Input.SubSequence? { + let pre = _slice.prefix(count) + guard tryAdvance(count) else { return nil } + return pre + } + + func starts(with s: S) -> Bool where S.Element == Char { + _slice.starts(with: s) + } + func peekPrefix( maxLength: Int? = nil, _ f: (Char) -> Bool @@ -153,11 +103,4 @@ extension Source { return pre } - - mutating func tryEat(count: Int) -> Input.SubSequence? { - let pre = _slice.prefix(count) - guard pre.count == count else { return nil } - defer { advance(count) } - return pre - } } diff --git a/Sources/_RegexParser/Regex/Parse/SourceLocation.swift b/Sources/_RegexParser/Regex/Parse/SourceLocation.swift index eb51643bd..6f6928d2f 100644 --- a/Sources/_RegexParser/Regex/Parse/SourceLocation.swift +++ b/Sources/_RegexParser/Regex/Parse/SourceLocation.swift @@ -126,3 +126,13 @@ extension Source.LocatedError: CustomStringConvertible { return error } } + +extension Error { + func addingLocation(_ loc: Range) -> Error { + // If we're already a LocatedError, don't change the location. 
+ if self is LocatedErrorProtocol { + return self + } + return Source.LocatedError(self, loc) + } +} diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 640cf5559..48a2512cf 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -167,6 +167,9 @@ extension AST.Atom { case .changeMatchingOptions(let opts): return "changeMatchingOptions<\(opts)>" + case .invalid: + return "" + case .char, .scalar: fatalError("Unreachable") } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 1fa3514bb..af46b5381 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -272,7 +272,7 @@ extension AST.Atom { case .scalarSequence, .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl, .backreference, .subpattern, .callout, - .backtrackingDirective, .changeMatchingOptions: + .backtrackingDirective, .changeMatchingOptions, .invalid: // FIXME: implement return nil } @@ -607,6 +607,9 @@ extension AST.Atom.CharacterProperty { case .javaSpecial(let s): throw Unsupported("TODO: map Java special: \(s)") + + case .invalid: + throw Unreachable("Expected valid property") } }() diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index a0cc11d01..b29053e14 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -931,6 +931,10 @@ extension AST.Atom { case .char, .scalar, .scalarSequence: return literalStringValue! + case .invalid: + // TODO: Can we recover the original regex text from the source range? 
+ return "<#value#>" + case let .property(p): return p._regexBase diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index e28c72514..49184deb3 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -18,47 +18,49 @@ func diagnose( _ input: String, expecting expected: ParseError, _ syntax: SyntaxOptions = .traditional, - _ f: (inout Source) throws -> (), + _ f: (inout Parser) -> (), file: StaticString = #file, line: UInt = #line ) { - var src = Source(input) - do { - try f(&src) + var parser = Parser(Source(input), syntax: syntax) + f(&parser) + + let diags = parser.diags.diags + guard diags.count == 1 else { XCTFail(""" - Passed, but expected error: \(expected) + Expected single diagnostic """, file: file, line: line) - } catch let e as Source.LocatedError { - guard e.error == expected else { - XCTFail(""" - - Expected: \(expected) - Actual: \(e.error) - """, file: file, line: line) - return - } - } catch let e { - fatalError("Should be unreachable: \(e)") + return + } + + let error = diags[0].underlyingParseError! 
+ guard error == expected else { + XCTFail(""" + + Expected: \(expected) + Actual: \(error) + """, file: file, line: line) + return } } extension RegexTests { func testLexicalAnalysis() { - diagnose("a", expecting: .expected("b")) { src in - try src.expect("b") + diagnose("a", expecting: .expected("b")) { p in + p.expect("b") } - diagnose("", expecting: .unexpectedEndOfInput) { src in - try src.expectNonEmpty() + diagnose("", expecting: .unexpectedEndOfInput) { p in + p.expectNonEmpty() } - diagnose("a", expecting: .unexpectedEndOfInput) { src in - try src.expect("a") // Ok - try src.expectNonEmpty() // Error + diagnose("a", expecting: .unexpectedEndOfInput) { p in + p.expect("a") // Ok + p.expectNonEmpty() // Error } let bigNum = "12345678901234567890" - diagnose(bigNum, expecting: .numberOverflow(bigNum)) { src in - _ = try src.lexNumber() + diagnose(bigNum, expecting: .numberOverflow(bigNum)) { p in + _ = p.lexNumber() } // TODO: want to dummy print out source ranges, etc, test that. diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 608d55978..960a7214a 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -33,57 +33,38 @@ extension AST.CustomCharacterClass.Member: ExpressibleByExtendedGraphemeClusterL } } -enum SemanticErrorKind { - case unsupported, invalid, unchecked -} - class RegexTests: XCTestCase {} func parseTest( _ input: String, _ expectedAST: AST.Node, - throwsError errorKind: SemanticErrorKind? 
= nil, - syntax: SyntaxOptions = .traditional, + throwsError expectedErrors: ParseError..., unsupported: Bool = false, + uncheckedErrors: Bool = false, syntax: SyntaxOptions = .traditional, captures expectedCaptures: CaptureList = [], file: StaticString = #file, line: UInt = #line ) { parseTest( input, .init(expectedAST, globalOptions: nil, diags: Diagnostics()), - throwsError: errorKind, syntax: syntax, captures: expectedCaptures, - file: file, line: line + throwsError: expectedErrors, unsupported: unsupported, + uncheckedErrors: uncheckedErrors, syntax: syntax, + captures: expectedCaptures, file: file, line: line ) } func parseTest( _ input: String, _ expectedAST: AST, - throwsError errorKind: SemanticErrorKind? = nil, + throwsError expectedErrors: [ParseError] = [], unsupported: Bool = false, + uncheckedErrors: Bool = false, syntax: SyntaxOptions = .traditional, captures expectedCaptures: CaptureList = [], file: StaticString = #file, line: UInt = #line ) { - let ast: AST - do { - ast = try parse(input, errorKind != nil ? 
.syntactic : .semantic, syntax) - } catch { - XCTFail("unexpected error: \(error)", file: file, line: line) - return - } - if let errorKind = errorKind, errorKind != .unchecked { - do { - _ = try parse(input, .semantic, syntax) - XCTFail("expected semantically invalid AST", file: file, line: line) - } catch let e as Source.LocatedError { - switch e.error { - case .unsupported: - XCTAssertEqual(errorKind, .unsupported, "\(e)", file: file, line: line) - default: - XCTAssertEqual(errorKind, .invalid, "\(e)", file: file, line: line) - } - } catch { - XCTFail("Error without source location: \(error)", file: file, line: line) - } - } + let ast = parseWithRecovery(input, syntax) + matchDiagnostics( + expectedErrors, for: ast, unsupported: unsupported, + unchecked: uncheckedErrors, file: file, line: line + ) guard ast == expectedAST || ast._dump() == expectedAST._dump() // EQ workaround else { @@ -173,42 +154,25 @@ func delimiterLexingTest( /// not considered part of it. func parseWithDelimitersTest( _ input: String, _ expecting: AST.Node, - throwsError errorKind: SemanticErrorKind? = nil, - ignoreTrailing: Bool = false, file: StaticString = #file, line: UInt = #line + throwsError expectedErrors: ParseError..., unsupported: Bool = false, + uncheckedErrors: Bool = false, ignoreTrailing: Bool = false, + file: StaticString = #file, line: UInt = #line ) { // First try lexing. let literal = delimiterLexingTest( input, ignoreTrailing: ignoreTrailing, file: file, line: line) - let ast: AST.Node - do { - ast = try parseWithDelimiters( - literal, errorKind != nil ? 
.syntactic : .semantic).root - } catch { - XCTFail("unexpected error: \(error)", file: file, line: line) - return - } - if let errorKind = errorKind { - do { - _ = try parseWithDelimiters(input, .semantic) - XCTFail("expected semantically invalid AST", file: file, line: line) - } catch let e as Source.LocatedError { - switch e.error { - case .unsupported: - XCTAssertEqual(errorKind, .unsupported, "\(e)", file: file, line: line) - default: - XCTAssertEqual(errorKind, .invalid, "\(e)", file: file, line: line) - } - } catch { - XCTFail("Error without source location: \(error)", file: file, line: line) - } - } - guard ast == expecting - || ast._dump() == expecting._dump() // EQ workaround + let ast = parseWithDelimitersWithRecovery(literal) + matchDiagnostics( + expectedErrors, for: ast, unsupported: unsupported, + unchecked: uncheckedErrors, file: file, line: line + ) + guard ast.root == expecting + || ast.root._dump() == expecting._dump() // EQ workaround else { XCTFail(""" Expected: \(expecting._dump()) - Found: \(ast._dump()) + Found: \(ast.root._dump()) """, file: file, line: line) return @@ -221,8 +185,8 @@ func parseNotEqualTest( syntax: SyntaxOptions = .traditional, file: StaticString = #file, line: UInt = #line ) { - let lhsAST = try! parse(lhs, .syntactic, syntax) - let rhsAST = try! parse(rhs, .syntactic, syntax) + let lhsAST = parseWithRecovery(lhs, syntax) + let rhsAST = parseWithRecovery(rhs, syntax) if lhsAST == rhsAST || lhsAST._dump() == rhsAST._dump() { XCTFail(""" AST: \(lhsAST._dump()) @@ -238,7 +202,7 @@ func rangeTest( at locFn: (AST.Node) -> SourceLocation = \.location, file: StaticString = #file, line: UInt = #line ) { - let ast = try! 
parse(input, .syntactic, syntax).root + let ast = parseWithRecovery(input, syntax).root let range = input.offsets(of: locFn(ast).range) let expected = expectedRange(input) @@ -252,60 +216,70 @@ func rangeTest( } } -func diagnosticTest( - _ input: String, _ expected: ParseError, - syntax: SyntaxOptions = .traditional, - file: StaticString = #file, line: UInt = #line +func matchDiagnostics( + _ expected: [ParseError], for ast: AST, unsupported: Bool, unchecked: Bool, + file: StaticString, line: UInt ) { - do { - let ast = try parse(input, .semantic, syntax) - XCTFail(""" + guard !unchecked else { return } + + var errors = Set() + for diag in ast.diags.diags where diag.isAnyError { + guard let underlying = diag.underlyingParseError else { + XCTFail( + "Unknown error emitted: '\(diag.message)'", file: file, line: line) + continue + } + // TODO: We should be uniquing based on source location, and failing if we + // emit duplicate diagnostics at the same location. + errors.insert(underlying) + } - Passed \(ast) - But expected error: \(expected) - """, file: file, line: line) - } catch let e as Source.LocatedError { - guard e.error == expected else { + // Filter out any unsupported errors if needed. 
+ if unsupported { + errors = errors.filter { + if case .unsupported = $0 { return false } else { return true } + } + } + for mismatched in errors.symmetricDifference(expected) { + if errors.contains(mismatched) { + XCTFail(""" + Unexpected error: \(mismatched) + """, file: file, line: line) + } else { XCTFail(""" - Expected: \(expected) - Actual: \(e.error) + Expected error not emitted: \(mismatched) + for AST: \(ast) """, file: file, line: line) - return } - } catch let e { - XCTFail("Error without source location: \(e)", file: file, line: line) } } -func diagnosticWithDelimitersTest( - _ input: String, _ expected: ParseError, ignoreTrailing: Bool = false, +func diagnosticTest( + _ input: String, _ expectedErrors: ParseError..., unsupported: Bool = false, + syntax: SyntaxOptions = .traditional, file: StaticString = #file, line: UInt = #line +) { + let ast = parseWithRecovery(input, syntax) + matchDiagnostics( + expectedErrors, for: ast, unsupported: unsupported, unchecked: false, + file: file, line: line + ) +} + +func diagnosticWithDelimitersTest( + _ input: String, _ expectedErrors: ParseError..., unsupported: Bool = false, + ignoreTrailing: Bool = false, file: StaticString = #file, line: UInt = #line ) { // First try lexing. 
let literal = delimiterLexingTest( input, ignoreTrailing: ignoreTrailing, file: file, line: line) - do { - let orig = try parseWithDelimiters(literal, .semantic) - let ast = orig.root - XCTFail(""" - - Passed \(ast) - But expected error: \(expected) - """, file: file, line: line) - } catch let e as Source.LocatedError { - guard e.error == expected else { - XCTFail(""" - - Expected: \(expected) - Actual: \(e.error) - """, file: file, line: line) - return - } - } catch let e { - XCTFail("Error without source location: \(e)", file: file, line: line) - } + let ast = parseWithDelimitersWithRecovery(literal) + matchDiagnostics( + expectedErrors, for: ast, unsupported: unsupported, unchecked: false, + file: file, line: line + ) } func delimiterLexingDiagnosticTest( @@ -345,6 +319,7 @@ func compilerInterfaceDiagnosticMessageTest( input, captureBufferOut: captureBuffer) XCTFail("Expected parse error", file: file, line: line) } catch let error as CompilerParseError { + XCTAssertNotNil(error.location, "Error without location", file: file, line: line) XCTAssertEqual(expectedErr, error.message, file: file, line: line) } catch { fatalError("Expected CompilerParseError") @@ -506,7 +481,7 @@ extension RegexTests { // FIXME: '\N' should be emitted through 'emitAny', not through the // _CharacterClassModel model. 
- parseTest(#"\N"#, escaped(.notNewline), throwsError: .unsupported) + parseTest(#"\N"#, escaped(.notNewline), unsupported: true) parseTest(#"\R"#, escaped(.newlineSequence)) @@ -681,12 +656,12 @@ extension RegexTests { range_m(.keyboardControl("A"), .keyboardControl("B")), range_m(.keyboardMetaControl("A"), .keyboardMetaControl("B")), range_m(.keyboardMeta("A"), .keyboardMeta("B")) - ), throwsError: .unsupported) + ), unsupported: true) parseTest( #"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass( range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE"))), - throwsError: .unsupported) + unsupported: true) parseTest( #"[\u{AA}-\u{BB}]"#, @@ -697,17 +672,17 @@ extension RegexTests { parseTest( #"[\u{AA BB}-\u{CC}]"#, charClass(range_m(scalarSeq_a("\u{AA}", "\u{BB}"), scalar_a("\u{CC}"))), - throwsError: .unsupported + unsupported: true ) parseTest( #"[\u{CC}-\u{AA BB}]"#, charClass(range_m(scalar_a("\u{CC}"), scalarSeq_a("\u{AA}", "\u{BB}"))), - throwsError: .unsupported + unsupported: true ) parseTest( #"[\u{a b c}]"#, charClass(scalarSeq_m("\u{A}", "\u{B}", "\u{C}")), - throwsError: .unsupported + unsupported: true ) parseTest(#"(?x)[ a - b ]"#, concat( @@ -823,13 +798,13 @@ extension RegexTests { parseTest(#"\\#u{3000}"#, "\u{3000}") // Control and meta controls. 
- parseTest(#"\c "#, atom(.keyboardControl(" ")), throwsError: .unsupported) - parseTest(#"\c!"#, atom(.keyboardControl("!")), throwsError: .unsupported) - parseTest(#"\c~"#, atom(.keyboardControl("~")), throwsError: .unsupported) - parseTest(#"\C--"#, atom(.keyboardControl("-")), throwsError: .unsupported) - parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a")), throwsError: .unsupported) - parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-")), throwsError: .unsupported) - parseTest(#"\M-a"#, atom(.keyboardMeta("a")), throwsError: .unsupported) + parseTest(#"\c "#, atom(.keyboardControl(" ")), unsupported: true) + parseTest(#"\c!"#, atom(.keyboardControl("!")), unsupported: true) + parseTest(#"\c~"#, atom(.keyboardControl("~")), unsupported: true) + parseTest(#"\C--"#, atom(.keyboardControl("-")), unsupported: true) + parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a")), unsupported: true) + parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-")), unsupported: true) + parseTest(#"\M-a"#, atom(.keyboardMeta("a")), unsupported: true) // MARK: Comments @@ -934,11 +909,11 @@ extension RegexTests { // Balanced captures parseTest(#"(?)"#, balancedCapture(name: "a", priorName: "c", empty()), - throwsError: .unsupported, captures: [.named("a")]) + unsupported: true, captures: [.named("a")]) parseTest(#"(?<-c>)"#, balancedCapture(name: nil, priorName: "c", empty()), - throwsError: .unsupported, captures: [.cap]) + unsupported: true, captures: [.cap]) parseTest(#"(?'a-b'c)"#, balancedCapture(name: "a", priorName: "b", "c"), - throwsError: .unsupported, captures: [.named("a")]) + unsupported: true, captures: [.named("a")]) // Capture resets. // FIXME: The captures in each branch should be unified. 
For now, we don't @@ -946,29 +921,30 @@ extension RegexTests { parseTest( "(?|(a)|(b))", nonCaptureReset(alt(capture("a"), capture("b"))), - throwsError: .unsupported, captures: [.opt, .opt] + unsupported: true, captures: [.opt, .opt] ) parseTest( "(?|(?a)|(b))", nonCaptureReset(alt(namedCapture("x", "a"), capture("b"))), - throwsError: .unsupported, captures: [.named("x", opt: 1), .opt] + unsupported: true, captures: [.named("x", opt: 1), .opt] ) parseTest( "(?|(a)|(?b))", nonCaptureReset(alt(capture("a"), namedCapture("x", "b"))), - throwsError: .unsupported, captures: [.opt, .named("x", opt: 1)] + unsupported: true, captures: [.opt, .named("x", opt: 1)] ) parseTest( "(?|(?a)|(?b))", nonCaptureReset(alt(namedCapture("x", "a"), namedCapture("x", "b"))), - throwsError: .invalid, captures: [.named("x", opt: 1), .named("x", opt: 1)] + throwsError: .duplicateNamedCapture("x"), unsupported: true, + captures: [.named("x", opt: 1), .named("x", opt: 1)] ) // TODO: Reject mismatched names? parseTest( "(?|(?a)|(?b))", nonCaptureReset(alt(namedCapture("x", "a"), namedCapture("y", "b"))), - throwsError: .unsupported, captures: [.named("x", opt: 1), .named("y", opt: 1)] + unsupported: true, captures: [.named("x", opt: 1), .named("y", opt: 1)] ) // Other groups @@ -977,7 +953,7 @@ extension RegexTests { concat("a", nonCapture("b"), "c")) parseTest( #"a(?|b)c"#, - concat("a", nonCaptureReset("b"), "c"), throwsError: .unsupported) + concat("a", nonCaptureReset("b"), "c"), unsupported: true) parseTest( #"a(?>b)c"#, concat("a", atomicNonCapturing("b"), "c")) @@ -995,41 +971,41 @@ extension RegexTests { concat("a", negativeLookahead("b"), "c")) parseTest("a(?<=b)c", - concat("a", lookbehind("b"), "c"), throwsError: .unsupported) + concat("a", lookbehind("b"), "c"), unsupported: true) parseTest("a(*plb:b)c", - concat("a", lookbehind("b"), "c"), throwsError: .unsupported) + concat("a", lookbehind("b"), "c"), unsupported: true) parseTest("a(*positive_lookbehind:b)c", - concat("a", 
lookbehind("b"), "c"), throwsError: .unsupported) + concat("a", lookbehind("b"), "c"), unsupported: true) parseTest("a(?"#, backreference(ref(plus: 4)), throwsError: .unsupported) - parseTest(#"\k<2>"#, backreference(ref(2)), throwsError: .invalid) - parseTest(#"\k'-3'"#, backreference(ref(minus: 3)), throwsError: .unsupported) - parseTest(#"\k'1'"#, backreference(ref(1)), throwsError: .invalid) + parseTest(#"\113"#, backreference(ref(113)), throwsError: .invalidReference(113)) + parseTest(#"\377"#, backreference(ref(377)), throwsError: .invalidReference(377)) + parseTest(#"\81"#, backreference(ref(81)), throwsError: .invalidReference(81)) + + parseTest(#"\g1"#, backreference(ref(1)), throwsError: .invalidReference(1)) + parseTest(#"\g001"#, backreference(ref(1)), throwsError: .invalidReference(1)) + parseTest(#"\g52"#, backreference(ref(52)), throwsError: .invalidReference(52)) + parseTest(#"\g-01"#, backreference(ref(minus: 1)), unsupported: true) + parseTest(#"\g+30"#, backreference(ref(plus: 30)), unsupported: true) + + parseTest(#"\g{1}"#, backreference(ref(1)), throwsError: .invalidReference(1)) + parseTest(#"\g{001}"#, backreference(ref(1)), throwsError: .invalidReference(1)) + parseTest(#"\g{52}"#, backreference(ref(52)), throwsError: .invalidReference(52)) + parseTest(#"\g{-01}"#, backreference(ref(minus: 1)), unsupported: true) + parseTest(#"\g{+30}"#, backreference(ref(plus: 30)), unsupported: true) + parseTest(#"\k<+4>"#, backreference(ref(plus: 4)), unsupported: true) + parseTest(#"\k<2>"#, backreference(ref(2)), throwsError: .invalidReference(2)) + parseTest(#"\k'-3'"#, backreference(ref(minus: 3)), unsupported: true) + parseTest(#"\k'1'"#, backreference(ref(1)), throwsError: .invalidReference(1)) parseTest( #"(?)\k"#, concat( @@ -1308,39 +1284,47 @@ extension RegexTests { ), captures: [.named("a")] ) - parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .invalid) - parseTest(#"\k"#, backreference(.named("bc")), throwsError: .invalid) - 
parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .invalid) - parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .invalid) + parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .invalidNamedReference("a0")) + parseTest(#"\k"#, backreference(.named("bc")), throwsError: .invalidNamedReference("bc")) + parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .invalidNamedReference("abc")) + parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .invalidNamedReference("abc")) // Oniguruma recursion levels. - parseTest(#"\k"#, backreference(.named("bc"), recursionLevel: 0), throwsError: .unsupported) - parseTest(#"\k"#, backreference(.named("a"), recursionLevel: 0), throwsError: .unsupported) - parseTest(#"\k<1+1>"#, backreference(ref(1), recursionLevel: 1), throwsError: .unsupported) - parseTest(#"\k<3-8>"#, backreference(ref(3), recursionLevel: -8), throwsError: .unsupported) - parseTest(#"\k'-3-8'"#, backreference(ref(minus: 3), recursionLevel: -8), throwsError: .unsupported) - parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8), throwsError: .unsupported) - parseTest(#"\k'+3-8'"#, backreference(ref(plus: 3), recursionLevel: -8), throwsError: .unsupported) - parseTest(#"\k'+3+8'"#, backreference(ref(plus: 3), recursionLevel: 8), throwsError: .unsupported) - - parseTest(#"(?R)"#, subpattern(ref(0)), throwsError: .unsupported) - parseTest(#"(?0)"#, subpattern(ref(0)), throwsError: .unsupported) - parseTest(#"(?1)"#, subpattern(ref(1)), throwsError: .unsupported) - parseTest(#"(?+12)"#, subpattern(ref(plus: 12)), throwsError: .unsupported) - parseTest(#"(?-2)"#, subpattern(ref(minus: 2)), throwsError: .unsupported) - parseTest(#"(?&hello)"#, subpattern(.named("hello")), throwsError: .unsupported) - parseTest(#"(?P>P)"#, subpattern(.named("P")), throwsError: .unsupported) + parseTest(#"\k"#, backreference(.named("bc"), recursionLevel: 0), + throwsError: .invalidNamedReference("bc"), 
unsupported: true) + parseTest(#"\k"#, backreference(.named("a"), recursionLevel: 0), + throwsError: .invalidNamedReference("a"), unsupported: true) + parseTest(#"\k<1+1>"#, backreference(ref(1), recursionLevel: 1), + throwsError: .invalidReference(1), unsupported: true) + parseTest(#"\k<3-8>"#, backreference(ref(3), recursionLevel: -8), + throwsError: .invalidReference(3), unsupported: true) + parseTest(#"\k'-3-8'"#, backreference(ref(minus: 3), recursionLevel: -8), + unsupported: true) + parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8), + throwsError: .invalidNamedReference("bc"), unsupported: true) + parseTest(#"\k'+3-8'"#, backreference(ref(plus: 3), recursionLevel: -8), + unsupported: true) + parseTest(#"\k'+3+8'"#, backreference(ref(plus: 3), recursionLevel: 8), + unsupported: true) + + parseTest(#"(?R)"#, subpattern(ref(0)), unsupported: true) + parseTest(#"(?0)"#, subpattern(ref(0)), unsupported: true) + parseTest(#"(?1)"#, subpattern(ref(1)), unsupported: true) + parseTest(#"(?+12)"#, subpattern(ref(plus: 12)), unsupported: true) + parseTest(#"(?-2)"#, subpattern(ref(minus: 2)), unsupported: true) + parseTest(#"(?&hello)"#, subpattern(.named("hello")), unsupported: true) + parseTest(#"(?P>P)"#, subpattern(.named("P")), unsupported: true) parseTest(#"[(?R)]"#, charClass("(", "?", "R", ")")) parseTest(#"[(?&a)]"#, charClass("(", "?", "&", "a", ")")) parseTest(#"[(?1)]"#, charClass("(", "?", "1", ")")) - parseTest(#"\g<1>"#, subpattern(ref(1)), throwsError: .unsupported) - parseTest(#"\g<001>"#, subpattern(ref(1)), throwsError: .unsupported) - parseTest(#"\g'52'"#, subpattern(ref(52)), throwsError: .unsupported) - parseTest(#"\g'-01'"#, subpattern(ref(minus: 1)), throwsError: .unsupported) - parseTest(#"\g'+30'"#, subpattern(ref(plus: 30)), throwsError: .unsupported) - parseTest(#"\g'abc'"#, subpattern(.named("abc")), throwsError: .unsupported) + parseTest(#"\g<1>"#, subpattern(ref(1)), unsupported: true) + parseTest(#"\g<001>"#, 
subpattern(ref(1)), unsupported: true) + parseTest(#"\g'52'"#, subpattern(ref(52)), unsupported: true) + parseTest(#"\g'-01'"#, subpattern(ref(minus: 1)), unsupported: true) + parseTest(#"\g'+30'"#, subpattern(ref(plus: 30)), unsupported: true) + parseTest(#"\g'abc'"#, subpattern(.named("abc")), unsupported: true) // These are valid references. parseTest(#"()\1"#, concat( @@ -1363,7 +1347,7 @@ extension RegexTests { parseTest(#"\N{abc}+"#, oneOrMore(of: atom(.namedCharacter("abc")))) parseTest( #"\N {2}"#, - concat(atom(.escaped(.notNewline)), exactly(2, of: " ")), throwsError: .unsupported + concat(atom(.escaped(.notNewline)), exactly(2, of: " ")), unsupported: true ) parseTest(#"\N{AA}"#, atom(.namedCharacter("AA"))) @@ -1429,12 +1413,12 @@ extension RegexTests { parseTest(#"\p{isAlphabetic}"#, prop(.binary(.alphabetic))) parseTest(#"\p{isAlpha=isFalse}"#, prop(.binary(.alphabetic, value: false))) - parseTest(#"\p{In_Runic}"#, prop(.block(.runic)), throwsError: .unsupported) + parseTest(#"\p{In_Runic}"#, prop(.block(.runic)), unsupported: true) parseTest(#"\p{Hebrew}"#, prop(.scriptExtension(.hebrew))) parseTest(#"\p{Is_Hebrew}"#, prop(.scriptExtension(.hebrew))) - parseTest(#"\p{In_Hebrew}"#, prop(.block(.hebrew)), throwsError: .unsupported) - parseTest(#"\p{Blk=Is_Hebrew}"#, prop(.block(.hebrew)), throwsError: .unsupported) + parseTest(#"\p{In_Hebrew}"#, prop(.block(.hebrew)), unsupported: true) + parseTest(#"\p{Blk=Is_Hebrew}"#, prop(.block(.hebrew)), unsupported: true) // These are the shorthand properties with an "in" prefix we currently // recognize. Make sure they don't clash with block properties. @@ -1457,38 +1441,38 @@ extension RegexTests { parseTest(#"\p{is\#(p.rawValue)}"#, prop(.posix(p))) } for b in Unicode.BinaryProperty.allCases { - // Some of these are unsupported, so don't check for semantic errors. 
- parseTest(#"\p{\#(b.rawValue)}"#, prop(.binary(b, value: true)), throwsError: .unchecked) - parseTest(#"\p{is\#(b.rawValue)}"#, prop(.binary(b, value: true)), throwsError: .unchecked) + // Some of these are unsupported, so don't check for errors. + parseTest(#"\p{\#(b.rawValue)}"#, prop(.binary(b, value: true)), uncheckedErrors: true) + parseTest(#"\p{is\#(b.rawValue)}"#, prop(.binary(b, value: true)), uncheckedErrors: true) } for j in AST.Atom.CharacterProperty.JavaSpecial.allCases { - parseTest(#"\p{\#(j.rawValue)}"#, prop(.javaSpecial(j)), throwsError: .unsupported) + parseTest(#"\p{\#(j.rawValue)}"#, prop(.javaSpecial(j)), unsupported: true) } // Try prefixing each block property with "in" to make sure we don't stomp // on any other property shorthands. for b in Unicode.Block.allCases { - parseTest(#"\p{in\#(b.rawValue)}"#, prop(.block(b)), throwsError: .unsupported) + parseTest(#"\p{in\#(b.rawValue)}"#, prop(.block(b)), unsupported: true) } parseTest(#"\p{ASCII}"#, prop(.ascii)) parseTest(#"\p{isASCII}"#, prop(.ascii)) - parseTest(#"\p{inASCII}"#, prop(.block(.basicLatin)), throwsError: .unsupported) + parseTest(#"\p{inASCII}"#, prop(.block(.basicLatin)), unsupported: true) - parseTest(#"\p{inBasicLatin}"#, prop(.block(.basicLatin)), throwsError: .unsupported) - parseTest(#"\p{In_Basic_Latin}"#, prop(.block(.basicLatin)), throwsError: .unsupported) - parseTest(#"\p{Blk=Basic_Latin}"#, prop(.block(.basicLatin)), throwsError: .unsupported) - parseTest(#"\p{Blk=Is_Basic_Latin}"#, prop(.block(.basicLatin)), throwsError: .unsupported) + parseTest(#"\p{inBasicLatin}"#, prop(.block(.basicLatin)), unsupported: true) + parseTest(#"\p{In_Basic_Latin}"#, prop(.block(.basicLatin)), unsupported: true) + parseTest(#"\p{Blk=Basic_Latin}"#, prop(.block(.basicLatin)), unsupported: true) + parseTest(#"\p{Blk=Is_Basic_Latin}"#, prop(.block(.basicLatin)), unsupported: true) parseTest(#"\p{isAny}"#, prop(.any)) parseTest(#"\p{isAssigned}"#, prop(.assigned)) - 
parseTest(#"\p{Xan}"#, prop(.pcreSpecial(.alphanumeric)), throwsError: .unsupported) - parseTest(#"\p{Xps}"#, prop(.pcreSpecial(.posixSpace)), throwsError: .unsupported) - parseTest(#"\p{Xsp}"#, prop(.pcreSpecial(.perlSpace)), throwsError: .unsupported) - parseTest(#"\p{Xuc}"#, prop(.pcreSpecial(.universallyNamed)), throwsError: .unsupported) - parseTest(#"\p{Xwd}"#, prop(.pcreSpecial(.perlWord)), throwsError: .unsupported) + parseTest(#"\p{Xan}"#, prop(.pcreSpecial(.alphanumeric)), unsupported: true) + parseTest(#"\p{Xps}"#, prop(.pcreSpecial(.posixSpace)), unsupported: true) + parseTest(#"\p{Xsp}"#, prop(.pcreSpecial(.perlSpace)), unsupported: true) + parseTest(#"\p{Xuc}"#, prop(.pcreSpecial(.universallyNamed)), unsupported: true) + parseTest(#"\p{Xwd}"#, prop(.pcreSpecial(.perlWord)), unsupported: true) parseTest(#"\p{alnum}"#, prop(.posix(.alnum))) parseTest(#"\p{is_alnum}"#, prop(.posix(.alnum))) @@ -1508,45 +1492,45 @@ extension RegexTests { // MARK: Conditionals parseTest(#"(?(1))"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()), unsupported: true) parseTest(#"(?(1)|)"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()), unsupported: true) parseTest(#"(?(1)a)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty()), throwsError: .unsupported) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty()), unsupported: true) parseTest(#"(?(1)a|)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty()), throwsError: .unsupported) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty()), unsupported: true) parseTest(#"(?(1)|b)"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: "b"), throwsError: .unsupported) + 
.groupMatched(ref(1)), trueBranch: empty(), falseBranch: "b"), unsupported: true) parseTest(#"(?(1)a|b)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: "b"), unsupported: true) parseTest(#"(?(1)(a|b|c)|d)"#, conditional( .groupMatched(ref(1)), trueBranch: capture(alt("a", "b", "c")), falseBranch: "d" - ), throwsError: .unsupported, captures: [.opt]) + ), unsupported: true, captures: [.opt]) parseTest(#"(?(+3))"#, conditional( - .groupMatched(ref(plus: 3)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) + .groupMatched(ref(plus: 3)), trueBranch: empty(), falseBranch: empty()), unsupported: true) parseTest(#"(?(-21))"#, conditional( - .groupMatched(ref(minus: 21)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) + .groupMatched(ref(minus: 21)), trueBranch: empty(), falseBranch: empty()), unsupported: true) // Oniguruma recursion levels. 
parseTest(#"(?(1+1))"#, conditional( .groupMatched(ref(1, recursionLevel: 1)), - trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported + trueBranch: empty(), falseBranch: empty()), unsupported: true ) parseTest(#"(?(-1+1))"#, conditional( .groupMatched(ref(minus: 1, recursionLevel: 1)), - trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported + trueBranch: empty(), falseBranch: empty()), unsupported: true ) parseTest(#"(?(1-3))"#, conditional( .groupMatched(ref(1, recursionLevel: -3)), - trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported + trueBranch: empty(), falseBranch: empty()), unsupported: true ) parseTest(#"(?(+1-3))"#, conditional( .groupMatched(ref(plus: 1, recursionLevel: -3)), - trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported + trueBranch: empty(), falseBranch: empty()), unsupported: true ) parseTest( #"(?)(?(a+5))"#, @@ -1554,7 +1538,7 @@ extension RegexTests { .groupMatched(ref("a", recursionLevel: 5)), trueBranch: empty(), falseBranch: empty() )), - throwsError: .unsupported, captures: [.named("a")] + unsupported: true, captures: [.named("a")] ) parseTest( #"(?)(?(a1-5))"#, @@ -1562,50 +1546,50 @@ extension RegexTests { .groupMatched(ref("a1", recursionLevel: -5)), trueBranch: empty(), falseBranch: empty() )), - throwsError: .unsupported, captures: [.named("a1")] + unsupported: true, captures: [.named("a1")] ) parseTest(#"(?(1))?"#, zeroOrOne(of: conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty())), throwsError: .unsupported) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty())), unsupported: true) parseTest(#"(?(R)a|b)"#, conditional( - .recursionCheck, trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) + .recursionCheck, trueBranch: "a", falseBranch: "b"), unsupported: true) parseTest(#"(?(R1))"#, conditional( - .groupRecursionCheck(ref(1)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) + 
.groupRecursionCheck(ref(1)), trueBranch: empty(), falseBranch: empty()), unsupported: true) parseTest(#"(?(R&abc)a|b)"#, conditional( - .groupRecursionCheck(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) + .groupRecursionCheck(ref("abc")), trueBranch: "a", falseBranch: "b"), unsupported: true) parseTest(#"(?()a|b)"#, conditional( - .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) + .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b"), unsupported: true) parseTest(#"(?('abc')a|b)"#, conditional( - .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) + .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b"), unsupported: true) parseTest(#"(?(abc)a|b)"#, conditional( groupCondition(.capture, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - ), throwsError: .unsupported, captures: [.cap]) + ), unsupported: true, captures: [.cap]) parseTest(#"(?(?:abc)a|b)"#, conditional( groupCondition(.nonCapture, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - ), throwsError: .unsupported) + ), unsupported: true) parseTest(#"(?(?=abc)a|b)"#, conditional( groupCondition(.lookahead, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - ), throwsError: .unsupported) + ), unsupported: true) parseTest(#"(?(?!abc)a|b)"#, conditional( groupCondition(.negativeLookahead, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - ), throwsError: .unsupported) + ), unsupported: true) parseTest(#"(?(?<=abc)a|b)"#, conditional( groupCondition(.lookbehind, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - ), throwsError: .unsupported) + ), unsupported: true) parseTest(#"(?(?y)(?(xxx)a|b)"#, concat( namedCapture("xxx", "y"), conditional(.groupMatched(ref("xxx")), trueBranch: "a", falseBranch: "b") - ), throwsError: .unsupported, captures: [.named("xxx")]) + ), unsupported: true, captures: [.named("xxx")]) 
parseTest(#"(?(1)(?(2)(?(3)))|a)"#, conditional( .groupMatched(ref(1)), @@ -1635,119 +1619,119 @@ extension RegexTests { trueBranch: empty(), falseBranch: empty()), falseBranch: empty()), - falseBranch: "a"), throwsError: .unsupported) + falseBranch: "a"), unsupported: true) parseTest(#"(?(DEFINE))"#, conditional( - .defineGroup, trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) + .defineGroup, trueBranch: empty(), falseBranch: empty()), unsupported: true) parseTest(#"(?(VERSION>=3.1))"#, conditional( pcreVersionCheck(.greaterThanOrEqual, 3, 1), - trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported + trueBranch: empty(), falseBranch: empty()), unsupported: true ) parseTest(#"(?(VERSION=0.1))"#, conditional( pcreVersionCheck(.equal, 0, 1), - trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported + trueBranch: empty(), falseBranch: empty()), unsupported: true ) // MARK: Callouts // PCRE callouts - parseTest(#"(?C)"#, pcreCallout(number: 0), throwsError: .unsupported) - parseTest(#"(?C0)"#, pcreCallout(number: 0), throwsError: .unsupported) - parseTest(#"(?C20)"#, pcreCallout(number: 20), throwsError: .unsupported) - parseTest("(?C{abc})", pcreCallout(string: "abc"), throwsError: .unsupported) + parseTest(#"(?C)"#, pcreCallout(number: 0), unsupported: true) + parseTest(#"(?C0)"#, pcreCallout(number: 0), unsupported: true) + parseTest(#"(?C20)"#, pcreCallout(number: 20), unsupported: true) + parseTest("(?C{abc})", pcreCallout(string: "abc"), unsupported: true) for delim in ["`", "'", "\"", "^", "%", "#", "$"] { parseTest("(?C\(delim)hello\(delim))", pcreCallout(string: "hello"), - throwsError: .unsupported) + unsupported: true) } // Oniguruma named callouts - parseTest("(*X)", onigurumaNamedCallout("X"), throwsError: .unsupported) - parseTest("(*foo[t])", onigurumaNamedCallout("foo", tag: "t"), throwsError: .unsupported) - parseTest("(*foo[a0]{b})", onigurumaNamedCallout("foo", tag: "a0", args: "b"), throwsError: 
.unsupported) - parseTest("(*foo{b})", onigurumaNamedCallout("foo", args: "b"), throwsError: .unsupported) - parseTest("(*foo[a]{a,b,c})", onigurumaNamedCallout("foo", tag: "a", args: "a", "b", "c"), throwsError: .unsupported) - parseTest("(*foo{a,b,c})", onigurumaNamedCallout("foo", args: "a", "b", "c"), throwsError: .unsupported) - parseTest("(*foo{%%$,!!,>>})", onigurumaNamedCallout("foo", args: "%%$", "!!", ">>"), throwsError: .unsupported) - parseTest("(*foo{a, b, c})", onigurumaNamedCallout("foo", args: "a", " b", " c"), throwsError: .unsupported) + parseTest("(*X)", onigurumaNamedCallout("X"), unsupported: true) + parseTest("(*foo[t])", onigurumaNamedCallout("foo", tag: "t"), unsupported: true) + parseTest("(*foo[a0]{b})", onigurumaNamedCallout("foo", tag: "a0", args: "b"), unsupported: true) + parseTest("(*foo{b})", onigurumaNamedCallout("foo", args: "b"), unsupported: true) + parseTest("(*foo[a]{a,b,c})", onigurumaNamedCallout("foo", tag: "a", args: "a", "b", "c"), unsupported: true) + parseTest("(*foo{a,b,c})", onigurumaNamedCallout("foo", args: "a", "b", "c"), unsupported: true) + parseTest("(*foo{%%$,!!,>>})", onigurumaNamedCallout("foo", args: "%%$", "!!", ">>"), unsupported: true) + parseTest("(*foo{a, b, c})", onigurumaNamedCallout("foo", args: "a", " b", " c"), unsupported: true) // Oniguruma 'of contents' callouts - parseTest("(?{x})", onigurumaCalloutOfContents("x"), throwsError: .unsupported) - parseTest("(?{{{x}}y}}})", onigurumaCalloutOfContents("x}}y"), throwsError: .unsupported) - parseTest("(?{{{x}}})", onigurumaCalloutOfContents("x"), throwsError: .unsupported) - parseTest("(?{x}[tag])", onigurumaCalloutOfContents("x", tag: "tag"), throwsError: .unsupported) - parseTest("(?{x}[tag]<)", onigurumaCalloutOfContents("x", tag: "tag", direction: .inRetraction), throwsError: .unsupported) - parseTest("(?{x}X)", onigurumaCalloutOfContents("x", direction: .both), throwsError: .unsupported) - parseTest("(?{x}>)", onigurumaCalloutOfContents("x"), 
throwsError: .unsupported) - parseTest("(?{\\x})", onigurumaCalloutOfContents("\\x"), throwsError: .unsupported) - parseTest("(?{\\})", onigurumaCalloutOfContents("\\"), throwsError: .unsupported) + parseTest("(?{x})", onigurumaCalloutOfContents("x"), unsupported: true) + parseTest("(?{{{x}}y}}})", onigurumaCalloutOfContents("x}}y"), unsupported: true) + parseTest("(?{{{x}}})", onigurumaCalloutOfContents("x"), unsupported: true) + parseTest("(?{x}[tag])", onigurumaCalloutOfContents("x", tag: "tag"), unsupported: true) + parseTest("(?{x}[tag]<)", onigurumaCalloutOfContents("x", tag: "tag", direction: .inRetraction), unsupported: true) + parseTest("(?{x}X)", onigurumaCalloutOfContents("x", direction: .both), unsupported: true) + parseTest("(?{x}>)", onigurumaCalloutOfContents("x"), unsupported: true) + parseTest("(?{\\x})", onigurumaCalloutOfContents("\\x"), unsupported: true) + parseTest("(?{\\})", onigurumaCalloutOfContents("\\"), unsupported: true) // MARK: Backtracking directives - parseTest("(*ACCEPT)?", zeroOrOne(of: backtrackingDirective(.accept)), throwsError: .unsupported) + parseTest("(*ACCEPT)?", zeroOrOne(of: backtrackingDirective(.accept)), unsupported: true) parseTest( "(*ACCEPT:a)??", zeroOrOne(.reluctant, of: backtrackingDirective(.accept, name: "a")), - throwsError: .unsupported + unsupported: true ) - parseTest("(*:a)", backtrackingDirective(.mark, name: "a"), throwsError: .unsupported) - parseTest("(*MARK:a)", backtrackingDirective(.mark, name: "a"), throwsError: .unsupported) - parseTest("(*F)", backtrackingDirective(.fail), throwsError: .unsupported) - parseTest("(*COMMIT)", backtrackingDirective(.commit), throwsError: .unsupported) - parseTest("(*SKIP)", backtrackingDirective(.skip), throwsError: .unsupported) - parseTest("(*SKIP:SKIP)", backtrackingDirective(.skip, name: "SKIP"), throwsError: .unsupported) - parseTest("(*PRUNE)", backtrackingDirective(.prune), throwsError: .unsupported) - parseTest("(*THEN)", backtrackingDirective(.then), 
throwsError: .unsupported) + parseTest("(*:a)", backtrackingDirective(.mark, name: "a"), unsupported: true) + parseTest("(*MARK:a)", backtrackingDirective(.mark, name: "a"), unsupported: true) + parseTest("(*F)", backtrackingDirective(.fail), unsupported: true) + parseTest("(*COMMIT)", backtrackingDirective(.commit), unsupported: true) + parseTest("(*SKIP)", backtrackingDirective(.skip), unsupported: true) + parseTest("(*SKIP:SKIP)", backtrackingDirective(.skip, name: "SKIP"), unsupported: true) + parseTest("(*PRUNE)", backtrackingDirective(.prune), unsupported: true) + parseTest("(*THEN)", backtrackingDirective(.then), unsupported: true) // MARK: Oniguruma absent functions - parseTest("(?~)", absentRepeater(empty()), throwsError: .unsupported) - parseTest("(?~abc)", absentRepeater(concat("a", "b", "c")), throwsError: .unsupported) - parseTest("(?~a+)", absentRepeater(oneOrMore(of: "a")), throwsError: .unsupported) - parseTest("(?~~)", absentRepeater("~"), throwsError: .unsupported) - parseTest("(?~a|b|c)", absentRepeater(alt("a", "b", "c")), throwsError: .unsupported) - parseTest("(?~(a))", absentRepeater(capture("a")), throwsError: .unsupported, captures: []) - parseTest("(?~)*", zeroOrMore(of: absentRepeater(empty())), throwsError: .unsupported) - - parseTest("(?~|abc)", absentStopper(concat("a", "b", "c")), throwsError: .unsupported) - parseTest("(?~|a+)", absentStopper(oneOrMore(of: "a")), throwsError: .unsupported) - parseTest("(?~|~)", absentStopper("~"), throwsError: .unsupported) - parseTest("(?~|(a))", absentStopper(capture("a")), throwsError: .unsupported, captures: []) - parseTest("(?~|a){2}", exactly(2, of: absentStopper("a")), throwsError: .unsupported) - - parseTest("(?~|a|b)", absentExpression("a", "b"), throwsError: .unsupported) - parseTest("(?~|~|~)", absentExpression("~", "~"), throwsError: .unsupported) + parseTest("(?~)", absentRepeater(empty()), unsupported: true) + parseTest("(?~abc)", absentRepeater(concat("a", "b", "c")), unsupported: 
true) + parseTest("(?~a+)", absentRepeater(oneOrMore(of: "a")), unsupported: true) + parseTest("(?~~)", absentRepeater("~"), unsupported: true) + parseTest("(?~a|b|c)", absentRepeater(alt("a", "b", "c")), unsupported: true) + parseTest("(?~(a))", absentRepeater(capture("a")), unsupported: true, captures: []) + parseTest("(?~)*", zeroOrMore(of: absentRepeater(empty())), unsupported: true) + + parseTest("(?~|abc)", absentStopper(concat("a", "b", "c")), unsupported: true) + parseTest("(?~|a+)", absentStopper(oneOrMore(of: "a")), unsupported: true) + parseTest("(?~|~)", absentStopper("~"), unsupported: true) + parseTest("(?~|(a))", absentStopper(capture("a")), unsupported: true, captures: []) + parseTest("(?~|a){2}", exactly(2, of: absentStopper("a")), unsupported: true) + + parseTest("(?~|a|b)", absentExpression("a", "b"), unsupported: true) + parseTest("(?~|~|~)", absentExpression("~", "~"), unsupported: true) parseTest("(?~|(a)|(?:b))", absentExpression(capture("a"), nonCapture("b")), - throwsError: .unsupported, captures: []) + unsupported: true, captures: []) parseTest("(?~|(a)|(?:(b)|c))", absentExpression( capture("a"), nonCapture(alt(capture("b"), "c")) - ), throwsError: .unsupported, captures: [.opt]) - parseTest("(?~|a|b)?", zeroOrOne(of: absentExpression("a", "b")), throwsError: .unsupported) + ), unsupported: true, captures: [.opt]) + parseTest("(?~|a|b)?", zeroOrOne(of: absentExpression("a", "b")), unsupported: true) - parseTest("(?~|)", absentRangeClear(), throwsError: .unsupported) + parseTest("(?~|)", absentRangeClear(), unsupported: true) // TODO: It's not really clear what this means, but Oniguruma parses it... // Maybe we should diagnose it? 
- parseTest("(?~|)+", oneOrMore(of: absentRangeClear()), throwsError: .unsupported) + parseTest("(?~|)+", oneOrMore(of: absentRangeClear()), unsupported: true) // MARK: Global matching options parseTest("(*CR)(*UTF)(*LIMIT_DEPTH=3)", ast( empty(), opts: .newlineMatching(.carriageReturnOnly), .utfMode, .limitDepth(.init(3, at: .fake)) - ), throwsError: .unsupported) + ), unsupported: true) parseTest( "(*BSR_UNICODE)3", ast("3", opts: .newlineSequenceMatching(.anyUnicode)), - throwsError: .unsupported) + unsupported: true) parseTest( "(*BSR_ANYCRLF)", ast( empty(), opts: .newlineSequenceMatching(.anyCarriageReturnOrLinefeed)), - throwsError: .unsupported) + unsupported: true) // TODO: Diagnose on multiple line matching modes? parseTest( @@ -1755,7 +1739,7 @@ extension RegexTests { ast(empty(), opts: [ .carriageReturnOnly, .linefeedOnly, .carriageAndLinefeedOnly, .anyCarriageReturnOrLinefeed, .anyUnicode, .nulCharacter - ].map { .newlineMatching($0) }), throwsError: .unsupported) + ].map { .newlineMatching($0) }), unsupported: true) parseTest( """ @@ -1768,7 +1752,7 @@ extension RegexTests { .limitMatch(.init(2, at: .fake)), .notEmpty, .notEmptyAtStart, .noAutoPossess, .noDotStarAnchor, .noJIT, .noStartOpt, .utfMode, .unicodeProperties - ), throwsError: .unsupported + ), unsupported: true ) parseTest("[(*CR)]", charClass("(", "*", "C", "R", ")")) @@ -1988,7 +1972,7 @@ extension RegexTests { # h """, ast(empty(), opts: .newlineMatching(.carriageReturnOnly)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -1999,7 +1983,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageReturnOnly)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2010,7 +1994,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.linefeedOnly)), - throwsError: .unsupported, 
syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2021,7 +2005,7 @@ extension RegexTests { # h """, ast(empty(), opts: .newlineMatching(.carriageAndLinefeedOnly)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2032,7 +2016,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageAndLinefeedOnly)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2043,7 +2027,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2054,7 +2038,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2065,7 +2049,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2076,7 +2060,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2097,7 +2081,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2108,7 +2092,7 @@ extension RegexTests { # h """, ast(concat("e", "f"), opts: .newlineMatching(.nulCharacter)), - throwsError: .unsupported, syntax: .extendedSyntax + 
unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2119,7 +2103,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.nulCharacter)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2133,7 +2117,7 @@ extension RegexTests { opts: .newlineMatching(.carriageReturnOnly), .newlineMatching(.nulCharacter) ), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseWithDelimitersTest( @@ -2306,7 +2290,7 @@ extension RegexTests { parseWithDelimitersTest( #"re'(?'a_bcA0-c1A'x*)'"#, balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x")), - throwsError: .unsupported) + unsupported: true) parseWithDelimitersTest( #"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b")))) @@ -2314,28 +2298,28 @@ extension RegexTests { parseWithDelimitersTest( #"re'(?('a_bcA0')x|y)'"#, conditional( .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y"), - throwsError: .unsupported + unsupported: true ) parseWithDelimitersTest( #"re'(?('+20')\')'"#, conditional( .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty()), - throwsError: .unsupported + unsupported: true ) parseWithDelimitersTest( - #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .invalid) + #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .invalidNamedReference("b0A")) parseWithDelimitersTest( #"re'\k'+2-1''"#, backreference(ref(plus: 2), recursionLevel: -1), - throwsError: .unsupported + unsupported: true ) parseWithDelimitersTest( - #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))), throwsError: .unsupported) + #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))), unsupported: true) parseWithDelimitersTest( - #"re'\g'-1'\''"#, concat(subpattern(ref(minus: 1)), "'"), throwsError: .unsupported) + #"re'\g'-1'\''"#, concat(subpattern(ref(minus: 1)), 
"'"), unsupported: true) parseWithDelimitersTest( #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(string: #"a*b\c 🔥_ ;"#), - throwsError: .unsupported) + unsupported: true) // Fine, because we don't end up skipping. delimiterLexingTest(#"re'(?'"#) @@ -2625,6 +2609,157 @@ extension RegexTests { rangeTest("(?~|a|b)", entireRange) } + func testParseRecovery() { + // MARK: Groups + + parseTest( + "(", capture(empty()), + throwsError: .expected(")"), captures: [.cap] + ) + parseTest( + "(abc", capture(concat("a", "b", "c")), + throwsError: .expected(")"), captures: [.cap] + ) + parseTest("(?", nonCapture(empty()), throwsError: .expectedGroupSpecifier, .expected(")")) + parseTest("(?:", nonCapture(empty()), throwsError: .expected(")")) + + parseTest( + "(?<", namedCapture("", empty()), + throwsError: .expectedIdentifier(.groupName), .expected(">"), .expected(")"), + captures: [.named("")] + ) + parseTest( + "(?"), .expected(")"), + captures: [.named("a")] + ) + + // MARK: Character classes + + parseTest("[", charClass(), throwsError: .expectedCustomCharacterClassMembers, .expected("]")) + parseTest("[^", charClass(inverted: true), throwsError: .expectedCustomCharacterClassMembers, .expected("]")) + parseTest("[a", charClass("a"), throwsError: .expected("]")) + + parseTest( + "[a&&", charClass(setOp("a", op: .intersection)), + throwsError: .expectedCustomCharacterClassMembers, .expected("]") + ) + parseTest( + "[a&&b", charClass(setOp("a", op: .intersection, "b")), + throwsError: .expected("]") + ) + + diagnosticTest("[:a", .expected("]")) + diagnosticTest("[:a:", .expected("]")) + diagnosticTest("[[:a", .expected("]")) + diagnosticTest("[[:a:", .expected("]")) + diagnosticTest("[[:a[:]", .expected("]")) + + diagnosticTest("[::]", .emptyProperty) + diagnosticTest("[:=:]", .emptyProperty) + diagnosticTest("[[::]]", .emptyProperty) + diagnosticTest("[[:=:]]", .emptyProperty) + + // MARK: Unicode Scalars + + parseTest(#"\u{"#, scalar("\u{0}"), throwsError: .expectedNumber("", kind: 
.hex), .expected("}")) + parseTest(#"\u{ "#, scalar("\u{0}"), throwsError: .expectedNumber("", kind: .hex), .expected("}")) + parseTest(#"\u{5"#, scalar("\u{5}"), throwsError: .expected("}")) + parseTest(#"\x{5"#, scalar("\u{5}"), throwsError: .expected("}")) + + parseTest(#"\u{ 5"#, scalar("\u{5}"), throwsError: .expected("}")) + parseTest(#"\u{ 5 "#, scalar("\u{5}"), throwsError: .expected("}")) + parseTest(#"\u{ 5 6"#, scalarSeq("\u{5}", "\u{6}"), throwsError: .expected("}")) + parseTest(#"\u{ 5 6 "#, scalarSeq("\u{5}", "\u{6}"), throwsError: .expected("}")) + + parseTest(#"\x{"#, scalar("\u{0}"), throwsError: .expectedNumber("", kind: .hex), .expected("}")) + + parseTest(#"\u{ A H }"#, scalarSeq("\u{A}", "\u{0}"), throwsError: .expectedNumber("H", kind: .hex)) + + parseTest(#"\uABC"#, scalar("\u{ABC}"), throwsError: .expectedNumDigits("ABC", 4)) + + // MARK: Named characters + + parseTest(#"\N{"#, atom(.namedCharacter("")), throwsError: .expected("}")) + parseTest(#"\N{a"#, atom(.namedCharacter("a")), throwsError: .expected("}")) + parseTest(#"\N{U"#, atom(.namedCharacter("U")), throwsError: .expected("}")) + parseTest(#"\N{U+"#, scalar("\u{0}"), throwsError: .expectedNumber("", kind: .hex), .expected("}")) + parseTest(#"\N{U+A"#, scalar("\u{A}"), throwsError: .expected("}")) + parseTest(#"\N{U+}"#, scalar("\u{0}"), throwsError: .expectedNumber("", kind: .hex)) + + // MARK: Character properties + + parseTest( + #"\p{"#, prop(.invalid(key: nil, value: "")), + throwsError: .emptyProperty, .expected("}") + ) + parseTest( + #"\p{a"#, prop(.invalid(key: nil, value: "a")), + throwsError: .unknownProperty(key: nil, value: "a"), .expected("}") + ) + parseTest( + #"\p{a="#, prop(.invalid(key: "a", value: "")), + throwsError: .emptyProperty, .expected("}") + ) + parseTest( + #"\p{a=b"#, prop(.invalid(key: "a", value: "b")), + throwsError: .unknownProperty(key: "a", value: "b"), .expected("}") + ) + parseTest( + #"\p{sc"#, prop(.generalCategory(.currencySymbol)), + 
throwsError: .expected("}") + ) + parseTest( + #"\p{sc="#, prop(.invalid(key: "sc", value: "")), + throwsError: .emptyProperty, .expected("}") + ) + parseTest( + #"\p{sc=a"#, prop(.invalid(key: "sc", value: "a")), + throwsError: .unrecognizedScript("a"), .expected("}") + ) + + // MARK: Matching options + + parseTest( + #"(?^"#, changeMatchingOptions(unsetMatchingOptions(), empty()), + throwsError: .expected(")") + ) + parseTest( + #"(?x"#, changeMatchingOptions(matchingOptions(adding: .extended), empty()), + throwsError: .expected(")") + ) + parseTest( + #"(?xi"#, changeMatchingOptions(matchingOptions(adding: .extended, .caseInsensitive), empty()), + throwsError: .expected(")") + ) + parseTest( + #"(?xi-"#, changeMatchingOptions( + matchingOptions(adding: .extended, .caseInsensitive), empty() + ), + throwsError: .expected(")") + ) + parseTest( + #"(?xi-n"#, changeMatchingOptions( + matchingOptions(adding: .extended, .caseInsensitive, removing: .namedCapturesOnly), + empty() + ), + throwsError: .expected(")") + ) + parseTest( + #"(?xz"#, changeMatchingOptions(matchingOptions(adding: .extended), "z"), + throwsError: .invalidMatchingOption("z"), .expected(")") + ) + parseTest( + #"(?x:"#, changeMatchingOptions(matchingOptions(adding: .extended), empty()), + throwsError: .expected(")") + ) + + // MARK: Invalid values + + parseTest("a{9999999999999999999999999999}", exactly(nil, of: "a"), + throwsError: .numberOverflow("9999999999999999999999999999")) + } + func testParseErrors() { // MARK: Unbalanced delimiters. 
@@ -2633,19 +2768,19 @@ extension RegexTests { diagnosticTest(")))", .unbalancedEndOfGroup) diagnosticTest("())()", .unbalancedEndOfGroup) - diagnosticTest("[", .expectedCustomCharacterClassMembers) - diagnosticTest("[^", .expectedCustomCharacterClassMembers) + diagnosticTest("[", .expectedCustomCharacterClassMembers, .expected("]")) + diagnosticTest("[^", .expectedCustomCharacterClassMembers, .expected("]")) diagnosticTest(#"\u{5"#, .expected("}")) diagnosticTest(#"\x{5"#, .expected("}")) diagnosticTest(#"\N{A"#, .expected("}")) diagnosticTest(#"\N{U+A"#, .expected("}")) - diagnosticTest(#"\p{a"#, .unknownProperty(key: nil, value: "a")) - diagnosticTest(#"\p{a="#, .emptyProperty) + diagnosticTest(#"\p{a"#, .unknownProperty(key: nil, value: "a"), .expected("}")) + diagnosticTest(#"\p{a="#, .emptyProperty, .expected("}")) diagnosticTest(#"\p{a=}"#, .emptyProperty) - diagnosticTest(#"\p{a=b"#, .unknownProperty(key: "a", value: "b")) - diagnosticTest(#"\p{aaa[b]}"#, .unknownProperty(key: nil, value: "aaa")) - diagnosticTest(#"\p{a=b=c}"#, .unknownProperty(key: "a", value: "b")) + diagnosticTest(#"\p{a=b"#, .unknownProperty(key: "a", value: "b"), .expected("}")) + diagnosticTest(#"\p{aaa[b]}"#, .unknownProperty(key: nil, value: "aaa"), .expected("}")) + diagnosticTest(#"\p{a=b=c}"#, .unknownProperty(key: "a", value: "b"), .expected("}")) diagnosticTest(#"\p{script=Not_A_Script}"#, .unrecognizedScript("Not_A_Script")) diagnosticTest(#"\p{scx=Not_A_Script}"#, .unrecognizedScript("Not_A_Script")) diagnosticTest(#"\p{gc=Not_A_Category}"#, .unrecognizedCategory("Not_A_Category")) @@ -2663,30 +2798,31 @@ extension RegexTests { diagnosticTest(#"(?#"#, .expected(")")) diagnosticTest(#"(?x"#, .expected(")")) - diagnosticTest(#"(?"#, .expectedGroupSpecifier) + diagnosticTest(#"(?"#, .expectedGroupSpecifier, .expected(")")) diagnosticTest(#"(?^"#, .expected(")")) diagnosticTest(#"(?^i"#, .expected(")")) - diagnosticTest(#"(?y)"#, .expected("{")) - diagnosticTest(#"(?y{)"#, 
.expected("g")) - diagnosticTest(#"(?y{g)"#, .expected("}")) - diagnosticTest(#"(?y{x})"#, .expected("g")) + // TODO: These errors could be better. + diagnosticTest(#"(?y)"#, .expected("{"), .expected("g"), .expected("}"), unsupported: true) + diagnosticTest(#"(?y{)"#, .expected("g"), .expected("}"), unsupported: true) + diagnosticTest(#"(?y{g)"#, .expected("}"), unsupported: true) + diagnosticTest(#"(?y{x})"#, .expected("g"), .expected("}"), .invalidMatchingOption("}"), unsupported: true) diagnosticTest(#"(?P"#, .expected(")")) - diagnosticTest(#"(?R"#, .expected(")")) + diagnosticTest(#"(?R"#, .expected(")"), unsupported: true) diagnosticTest(#""ab"#, .expected("\""), syntax: .experimental) diagnosticTest(#""ab\""#, .expected("\""), syntax: .experimental) - diagnosticTest("\"ab\\", .expectedEscape, syntax: .experimental) + diagnosticTest("\"ab\\", .expectedEscape, .expected("\""), syntax: .experimental) - diagnosticTest("(?C", .expected(")")) + diagnosticTest("(?C", .expected(")"), unsupported: true) - diagnosticTest("(?<", .expectedIdentifier(.groupName)) - diagnosticTest("(?")) - diagnosticTest("(?")) - diagnosticTest("(?", .expected(")")) + diagnosticTest("(?<", .expectedIdentifier(.groupName), .expected(">"), .expected(")")) + diagnosticTest("(?"), .expected(")")) + diagnosticTest("(?"), .expected(")"), unsupported: true) + diagnosticTest("(?"), .expected(")"), unsupported: true) + diagnosticTest("(?"), .expected(")"), unsupported: true) + diagnosticTest("(?", .expected(")"), unsupported: true) // MARK: Character classes @@ -2727,17 +2863,17 @@ extension RegexTests { diagnosticTest("[a-[b]]", .unsupportedDotNetSubtraction) diagnosticTest(#"[abc-[def]]"#, .unsupportedDotNetSubtraction) diagnosticTest(#"[abc-[^def]]"#, .unsupportedDotNetSubtraction) - diagnosticTest(#"[\d\u{0}[a]-[b-[c]]]"#, .unsupportedDotNetSubtraction) + diagnosticTest(#"[\d\u{0}[a]-[b-[c]]]"#, .unsupportedDotNetSubtraction, .invalidCharacterClassRangeOperand) 
diagnosticTest("[a-z-[d-w-[m-o]]]", .unsupportedDotNetSubtraction) diagnosticTest(#"[a-[:b]]"#, .unsupportedDotNetSubtraction) diagnosticTest(#"[[a]-[b]]"#, .invalidCharacterClassRangeOperand) diagnosticTest(#"[ -[ ]]"#, .unsupportedDotNetSubtraction) diagnosticTest(#"(?x)[a - [b] ]"#, .unsupportedDotNetSubtraction) - diagnosticTest(#"[a-[]]"#, .expectedCustomCharacterClassMembers) + diagnosticTest(#"[a-[]]"#, .expectedCustomCharacterClassMembers, .unsupportedDotNetSubtraction) diagnosticTest(#"[-[]]"#, .expectedCustomCharacterClassMembers) diagnosticTest(#"(?x)[ - [ ] ]"#, .expectedCustomCharacterClassMembers) - diagnosticTest(#"(?x)[a-[ ] ]"#, .expectedCustomCharacterClassMembers) + diagnosticTest(#"(?x)[a-[ ] ]"#, .expectedCustomCharacterClassMembers, .unsupportedDotNetSubtraction) diagnosticTest(#"[a-[:digit:]]"#, .invalidCharacterClassRangeOperand) diagnosticTest("[--]", .expectedCustomCharacterClassMembers) @@ -2756,8 +2892,8 @@ extension RegexTests { diagnosticTest("(?x)[(?#)]", .expected("]")) diagnosticTest("(?x)[(?#abc)]", .expected("]")) - diagnosticTest("(?x)[#]", .expectedCustomCharacterClassMembers) - diagnosticTest("(?x)[ # abc]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[#]", .expectedCustomCharacterClassMembers, .expected("]")) + diagnosticTest("(?x)[ # abc]", .expectedCustomCharacterClassMembers, .expected("]")) // MARK: Bad escapes @@ -2806,7 +2942,7 @@ extension RegexTests { // MARK: Confusable characters diagnosticTest("[\u{301}]", .confusableCharacter("[\u{301}")) - diagnosticTest("(\u{358})", .confusableCharacter("(\u{358}")) + diagnosticTest("(\u{358})", .confusableCharacter("(\u{358}"), .unbalancedEndOfGroup) diagnosticTest("{\u{35B}}", .confusableCharacter("{\u{35B}")) diagnosticTest(#"\\#u{35C}"#, .confusableCharacter(#"\\#u{35C}"#)) diagnosticTest("^\u{35D}", .confusableCharacter("^\u{35D}")) @@ -2822,7 +2958,7 @@ extension RegexTests { diagnosticTest("<{)}>", .unsupported("interpolation")) diagnosticTest("<{}}>", 
.unsupported("interpolation")) diagnosticTest("<{<{}>", .unsupported("interpolation")) - diagnosticTest("(<{)}>", .unsupported("interpolation")) + diagnosticTest("(<{)}>", .expected(")"), .unsupported("interpolation")) // MARK: Character properties @@ -2831,7 +2967,7 @@ extension RegexTests { diagnosticTest(#"\p{x=y}"#, .unknownProperty(key: "x", value: "y")) diagnosticTest(#"\p{aaa(b)}"#, .unknownProperty(key: nil, value: "aaa(b)")) diagnosticTest("[[:a():]]", .unknownProperty(key: nil, value: "a()")) - diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: "aaa")) + diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: "aaa"), .expected("}"), .unknownProperty(key: nil, value: "b")) diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{")) diagnosticTest(#"\p{Basic_Latin}"#, .unknownProperty(key: nil, value: "Basic_Latin")) @@ -2843,16 +2979,15 @@ extension RegexTests { // MARK: Matching options - diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) - diagnosticTest("(?-y{w})", .cannotRemoveTextSegmentOptions) + diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions, unsupported: true) + diagnosticTest("(?-y{w})", .cannotRemoveTextSegmentOptions, unsupported: true) - // FIXME: Reenable once we figure out (?X) and (?u) semantics - //diagnosticTest("(?-X)", .cannotRemoveSemanticsOptions) - //diagnosticTest("(?-u)", .cannotRemoveSemanticsOptions) - diagnosticTest("(?-b)", .cannotRemoveSemanticsOptions) + // FIXME: We need to figure out (?X) and (?u) semantics + diagnosticTest("(?-X)", .cannotRemoveSemanticsOptions, unsupported: true) + diagnosticTest("(?-u)", .cannotRemoveSemanticsOptions, unsupported: true) + diagnosticTest("(?-b)", .cannotRemoveSemanticsOptions, unsupported: true) diagnosticTest("(?a)", .unknownGroupKind("?a")) - diagnosticTest("(?y{)", .expected("g")) // Extended syntax may not be removed in multi-line mode. 
diagnosticWithDelimitersTest(""" @@ -2946,23 +3081,24 @@ extension RegexTests { // MARK: Group specifiers - diagnosticTest(#"(*"#, .unknownGroupKind("*")) + diagnosticTest(#"(*"#, .expectedIdentifier(.onigurumaCalloutName), .expected(")"), unsupported: true) diagnosticTest(#"(?k)"#, .unknownGroupKind("?k")) diagnosticTest(#"(?P#)"#, .invalidMatchingOption("#")) - diagnosticTest(#"(?<#>)"#, .identifierMustBeAlphaNumeric(.groupName)) + // TODO: We shouldn't emit the expected closing delimiter here and elsewhere. + diagnosticTest(#"(?<#>)"#, .expected(">"), .identifierMustBeAlphaNumeric(.groupName)) diagnosticTest(#"(?'1A')"#, .identifierCannotStartWithNumber(.groupName)) // TODO: It might be better if tried to consume up to the closing `'` and // diagnosed an invalid group name based on that. diagnosticTest(#"(?'abc ')"#, .expected("'")) - diagnosticTest("(?'🔥')", .identifierMustBeAlphaNumeric(.groupName)) + diagnosticTest("(?'🔥')", .identifierMustBeAlphaNumeric(.groupName), .expected("'")) - diagnosticTest(#"(?'-')"#, .expectedIdentifier(.groupName)) - diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName)) - diagnosticTest(#"(?'a-b-c')"#, .expected("'")) + diagnosticTest(#"(?'-')"#, .expectedIdentifier(.groupName), unsupported: true) + diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName), .expected("'"), unsupported: true) + diagnosticTest(#"(?'a-b-c')"#, .expected("'"), unsupported: true) diagnosticTest("(?x)(? : )", .unknownGroupKind("? 
")) @@ -2996,7 +3132,7 @@ extension RegexTests { diagnosticTest(#"$?"#, .notQuantifiable) diagnosticTest(#"(?=a)+"#, .notQuantifiable) diagnosticTest(#"(?i)*"#, .notQuantifiable) - diagnosticTest(#"\K{1}"#, .unsupported(#"'\K'"#)) + diagnosticTest(#"\K{1}"#, .unsupported(#"'\K'"#), .notQuantifiable) diagnosticTest(#"\y{2,5}"#, .notQuantifiable) diagnosticTest(#"\Y{3,}"#, .notQuantifiable) @@ -3004,8 +3140,8 @@ extension RegexTests { diagnosticTest(#"\u{G}"#, .expectedNumber("G", kind: .hex)) - diagnosticTest(#"\u{"#, .expectedNumber("", kind: .hex)) - diagnosticTest(#"\u{ "#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{"#, .expectedNumber("", kind: .hex), .expected("}")) + diagnosticTest(#"\u{ "#, .expectedNumber("", kind: .hex), .expected("}")) diagnosticTest(#"\u{}"#, .expectedNumber("", kind: .hex)) diagnosticTest(#"\u{ }"#, .expectedNumber("", kind: .hex)) diagnosticTest(#"\u{ }"#, .expectedNumber("", kind: .hex)) @@ -3013,7 +3149,7 @@ extension RegexTests { diagnosticTest(#"\u{G }"#, .expectedNumber("G", kind: .hex)) diagnosticTest(#"\u{ G }"#, .expectedNumber("G", kind: .hex)) diagnosticTest(#"\u{ GH }"#, .expectedNumber("GH", kind: .hex)) - diagnosticTest(#"\u{ G H }"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{ G H }"#, .expectedNumber("G", kind: .hex), .expectedNumber("H", kind: .hex)) diagnosticTest(#"\u{ ABC G }"#, .expectedNumber("G", kind: .hex)) diagnosticTest(#"\u{ FFFFFFFFF A }"#, .numberOverflow("FFFFFFFFF")) @@ -3027,38 +3163,38 @@ extension RegexTests { // MARK: Matching options - diagnosticTest(#"(?^-"#, .cannotRemoveMatchingOptionsAfterCaret) + diagnosticTest(#"(?^-"#, .cannotRemoveMatchingOptionsAfterCaret, .expected(")")) diagnosticTest(#"(?^-)"#, .cannotRemoveMatchingOptionsAfterCaret) - diagnosticTest(#"(?^i-"#, .cannotRemoveMatchingOptionsAfterCaret) + diagnosticTest(#"(?^i-"#, .cannotRemoveMatchingOptionsAfterCaret, .expected(")")) diagnosticTest(#"(?^i-m)"#, .cannotRemoveMatchingOptionsAfterCaret) 
diagnosticTest(#"(?i)?"#, .notQuantifiable) // MARK: References diagnosticTest(#"\k''"#, .expectedIdentifier(.groupName)) - diagnosticTest(#"(?&)"#, .expectedIdentifier(.groupName)) - diagnosticTest(#"(?P>)"#, .expectedIdentifier(.groupName)) + diagnosticTest(#"(?&)"#, .expectedIdentifier(.groupName), unsupported: true) + diagnosticTest(#"(?P>)"#, .expectedIdentifier(.groupName), unsupported: true) diagnosticTest(#"\g{0}"#, .cannotReferToWholePattern) - diagnosticTest(#"(?(0))"#, .cannotReferToWholePattern) + diagnosticTest(#"(?(0))"#, .cannotReferToWholePattern, unsupported: true) - diagnosticTest(#"(?&&)"#, .identifierMustBeAlphaNumeric(.groupName)) - diagnosticTest(#"(?&-1)"#, .identifierMustBeAlphaNumeric(.groupName)) - diagnosticTest(#"(?P>+1)"#, .identifierMustBeAlphaNumeric(.groupName)) - diagnosticTest(#"(?P=+1)"#, .identifierMustBeAlphaNumeric(.groupName)) - diagnosticTest(#"\k'#'"#, .identifierMustBeAlphaNumeric(.groupName)) - diagnosticTest(#"(?&#)"#, .identifierMustBeAlphaNumeric(.groupName)) + diagnosticTest(#"(?&&)"#, .identifierMustBeAlphaNumeric(.groupName), .unbalancedEndOfGroup, .expected(")"), unsupported: true) + diagnosticTest(#"(?&-1)"#, .identifierMustBeAlphaNumeric(.groupName), .unbalancedEndOfGroup, .expected(")"), unsupported: true) + diagnosticTest(#"(?P>+1)"#, .identifierMustBeAlphaNumeric(.groupName), .unbalancedEndOfGroup, .expected(")"), unsupported: true) + diagnosticTest(#"(?P=+1)"#, .identifierMustBeAlphaNumeric(.groupName), .unbalancedEndOfGroup, .expected(")"), unsupported: true) + diagnosticTest(#"\k'#'"#, .identifierMustBeAlphaNumeric(.groupName), .expected("'"), unsupported: true) + diagnosticTest(#"(?&#)"#, .identifierMustBeAlphaNumeric(.groupName), .unbalancedEndOfGroup, .expected(")"), unsupported: true) - diagnosticTest(#"(?P>1)"#, .identifierCannotStartWithNumber(.groupName)) - diagnosticTest(#"\k{1}"#, .identifierCannotStartWithNumber(.groupName)) + diagnosticTest(#"(?P>1)"#, .identifierCannotStartWithNumber(.groupName), 
unsupported: true) + diagnosticTest(#"\k{1}"#, .identifierCannotStartWithNumber(.groupName), .invalidNamedReference("1")) - diagnosticTest(#"\g<1-1>"#, .expected(">")) - diagnosticTest(#"\g{1-1}"#, .expected("}")) - diagnosticTest(#"\k{a-1}"#, .expected("}")) - diagnosticTest(#"\k{a-}"#, .expected("}")) + diagnosticTest(#"\g<1-1>"#, .expected(">"), unsupported: true) + diagnosticTest(#"\g{1-1}"#, .expected("}"), .invalidReference(1)) + diagnosticTest(#"\k{a-1}"#, .expected("}"), .invalidNamedReference("a")) + diagnosticTest(#"\k{a-}"#, .expected("}"), .invalidNamedReference("a")) - diagnosticTest(#"\k"#, .expectedNumber("", kind: .decimal)) - diagnosticTest(#"\k<1+>"#, .expectedNumber("", kind: .decimal)) + diagnosticTest(#"\k"#, .expectedNumber("", kind: .decimal), .invalidNamedReference("a")) + diagnosticTest(#"\k<1+>"#, .expectedNumber("", kind: .decimal), .invalidReference(1)) diagnosticTest(#"()\k<1+1>"#, .unsupported("recursion level")) diagnosticTest(#"()\k<1-1>"#, .unsupported("recursion level")) @@ -3078,65 +3214,67 @@ extension RegexTests { // MARK: Conditionals - diagnosticTest(#"(?(1)a|b|c)"#, .tooManyBranchesInConditional(3)) - diagnosticTest(#"(?(1)||)"#, .tooManyBranchesInConditional(3)) + diagnosticTest(#"(?(1)a|b|c)"#, .tooManyBranchesInConditional(3), unsupported: true) + diagnosticTest(#"(?(1)||)"#, .tooManyBranchesInConditional(3), unsupported: true) diagnosticTest(#"(?(?i))"#, .unknownGroupKind("?(")) // MARK: Callouts // PCRE callouts - diagnosticTest("(?C-1)", .unknownCalloutKind("(?C-1)")) - diagnosticTest("(?C-1", .unknownCalloutKind("(?C-1)")) + diagnosticTest("(?C-1)", .unknownCalloutKind("(?C-1)"), unsupported: true) + diagnosticTest("(?C-1", .unknownCalloutKind("(?C-1)"), .expected(")"), unsupported: true) // Oniguruma named callouts - diagnosticTest("(*bar[", .expectedIdentifier(.onigurumaCalloutTag)) - diagnosticTest("(*bar[%", .identifierMustBeAlphaNumeric(.onigurumaCalloutTag)) - diagnosticTest("(*bar{", .expectedCalloutArgument) - 
diagnosticTest("(*bar}", .expected(")")) - diagnosticTest("(*bar]", .expected(")")) + diagnosticTest("(*bar[", .expectedIdentifier(.onigurumaCalloutTag), .expected("]"), .expected(")"), unsupported: true) + diagnosticTest("(*bar[%", .identifierMustBeAlphaNumeric(.onigurumaCalloutTag), .expected("]"), .expected(")"), unsupported: true) + diagnosticTest("(*bar{", .expectedCalloutArgument, .expected("}"), .expected(")"), unsupported: true) + diagnosticTest("(*bar}", .expected(")"), unsupported: true) + diagnosticTest("(*bar]", .expected(")"), unsupported: true) // Oniguruma 'of contents' callouts - diagnosticTest("(?{", .expected("}")) - diagnosticTest("(?{}", .expectedNonEmptyContents) - diagnosticTest("(?{x}", .expected(")")) - diagnosticTest("(?{x}}", .expected(")")) - diagnosticTest("(?{{x}}", .expected(")")) - diagnosticTest("(?{{x}", .expected("}")) - diagnosticTest("(?{x}[", .expectedIdentifier(.onigurumaCalloutTag)) - diagnosticTest("(?{x}[%", .identifierMustBeAlphaNumeric(.onigurumaCalloutTag)) - diagnosticTest("(?{x}[a]", .expected(")")) - diagnosticTest("(?{x}[a]K", .expected(")")) - diagnosticTest("(?{x}[a]X", .expected(")")) - diagnosticTest("(?{{x}y}", .expected("}")) + diagnosticTest("(?{", .expected("}"), .expectedNonEmptyContents, .expected(")"), unsupported: true) + diagnosticTest("(?{}", .expectedNonEmptyContents, .expected(")"), unsupported: true) + diagnosticTest("(?{x}", .expected(")"), unsupported: true) + diagnosticTest("(?{x}}", .expected(")"), unsupported: true) + diagnosticTest("(?{{x}}", .expected(")"), unsupported: true) + + // TODO: We shouldn't be emitting both 'expected }' and 'expected }}' here. 
+ diagnosticTest("(?{{x}", .expected("}"), .expected("}}"), .expected(")"), unsupported: true) + diagnosticTest("(?{x}[", .expectedIdentifier(.onigurumaCalloutTag), .expected("]"), .expected(")"), unsupported: true) + diagnosticTest("(?{x}[%", .identifierMustBeAlphaNumeric(.onigurumaCalloutTag), .expected("]"), .expected(")"), unsupported: true) + diagnosticTest("(?{x}[a]", .expected(")"), unsupported: true) + diagnosticTest("(?{x}[a]K", .expected(")"), unsupported: true) + diagnosticTest("(?{x}[a]X", .expected(")"), unsupported: true) + diagnosticTest("(?{{x}y}", .expected("}"), .expected("}}"), .expected(")"), unsupported: true) // MARK: Backtracking directives - diagnosticTest("(*MARK)", .backtrackingDirectiveMustHaveName("MARK")) - diagnosticTest("(*:)", .expectedNonEmptyContents) - diagnosticTest("(*MARK:a)?", .unsupported("backtracking directive")) - diagnosticTest("(*FAIL)+", .unsupported("backtracking directive")) - diagnosticTest("(*COMMIT:b)*", .unsupported("backtracking directive")) - diagnosticTest("(*PRUNE:a)??", .unsupported("backtracking directive")) - diagnosticTest("(*SKIP:a)*?", .unsupported("backtracking directive")) - diagnosticTest("(*F)+?", .unsupported("backtracking directive")) - diagnosticTest("(*:a){2}", .unsupported("backtracking directive")) + diagnosticTest("(*MARK)", .backtrackingDirectiveMustHaveName("MARK"), unsupported: true) + diagnosticTest("(*:)", .expectedNonEmptyContents, unsupported: true) + diagnosticTest("(*MARK:a)?", .notQuantifiable, unsupported: true) + diagnosticTest("(*FAIL)+", .notQuantifiable, unsupported: true) + diagnosticTest("(*COMMIT:b)*", .notQuantifiable, unsupported: true) + diagnosticTest("(*PRUNE:a)??", .notQuantifiable, unsupported: true) + diagnosticTest("(*SKIP:a)*?", .notQuantifiable, unsupported: true) + diagnosticTest("(*F)+?", .notQuantifiable, unsupported: true) + diagnosticTest("(*:a){2}", .notQuantifiable, unsupported: true) // MARK: Oniguruma absent functions - diagnosticTest("(?~", 
.expected(")")) - diagnosticTest("(?~|", .expected(")")) - diagnosticTest("(?~|a|b|c)", .tooManyAbsentExpressionChildren(3)) - diagnosticTest("(?~||||)", .tooManyAbsentExpressionChildren(4)) + diagnosticTest("(?~", .expected(")"), unsupported: true) + diagnosticTest("(?~|", .expected(")"), unsupported: true) + diagnosticTest("(?~|a|b|c)", .tooManyAbsentExpressionChildren(3), unsupported: true) + diagnosticTest("(?~||||)", .tooManyAbsentExpressionChildren(4), unsupported: true) // MARK: Global matching options diagnosticTest("a(*CR)", .globalMatchingOptionNotAtStart("(*CR)")) - diagnosticTest("(*CR)a(*LF)", .globalMatchingOptionNotAtStart("(*LF)")) - diagnosticTest("(*LIMIT_HEAP)", .expected("=")) - diagnosticTest("(*LIMIT_DEPTH=", .expectedNumber("", kind: .decimal)) + diagnosticTest("(*CR)a(*LF)", .globalMatchingOptionNotAtStart("(*LF)"), unsupported: true) + diagnosticTest("(*LIMIT_HEAP)", .expected("="), .expectedNumber("", kind: .decimal), unsupported: true) + diagnosticTest("(*LIMIT_DEPTH=", .expectedNumber("", kind: .decimal), .expected(")"), unsupported: true) // TODO: This diagnostic could be better. - diagnosticTest("(*LIMIT_DEPTH=-1", .expectedNumber("", kind: .decimal)) + diagnosticTest("(*LIMIT_DEPTH=-1", .expectedNumber("", kind: .decimal), .expected(")"), unsupported: true) } func testDelimiterLexingErrors() { @@ -3182,4 +3320,41 @@ extension RegexTests { compilerInterfaceDiagnosticMessageTest( #"#/\u{}/#"#, "cannot parse regular expression: expected hexadecimal number") } + + func testParserFatalError() { + do { + var p = Parser(Source(""), syntax: .traditional) + p.advance() + try p.parse().ensureValid() + XCTFail("Expected unreachable") + } catch let err { + if !"\(err)".hasPrefix("UNREACHABLE") { + XCTFail("Expected unreachable \(err)") + } + } + + // Make sure fatal errors are preserved through lookaheads and backtracks. + do { + var p = Parser(Source(""), syntax: .traditional) + p.lookahead { p in + p.tryEating { p -> Void? 
in + p.lookahead { p in + p.advance() + p.lookahead { _ in } + p.tryEating { _ in } + } + return nil + } + } + if p.diags.diags.count != 1 { + XCTFail("Expected single fatal diagnostic") + } + try p.diags.throwAnyError() + XCTFail("Expected unreachable") + } catch let err { + if !"\(err)".hasPrefix("UNREACHABLE") { + XCTFail("Expected unreachable \(err)") + } + } + } } From 5967767d67edee5701b929483375f876bb198bc7 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 29 Jun 2022 11:15:02 +0100 Subject: [PATCH 5/6] Improve recovery for identifiers and text segment options Scan to the closing delimiter of an invalid identifier, and better diagnose an invalid text segment option. --- .../Regex/Parse/Diagnostics.swift | 3 +++ .../Regex/Parse/LexicalAnalysis.swift | 22 ++++++++++++---- Tests/RegexTests/ParseTests.swift | 26 +++++++++---------- 3 files changed, 32 insertions(+), 19 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index a23e0aed1..f5c0d7075 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -55,6 +55,7 @@ enum ParseError: Error, Hashable { case unknownGroupKind(String) case unknownCalloutKind(String) + case unknownTextSegmentMatchingOption(Character) case invalidMatchingOption(Character) case cannotRemoveMatchingOptionsAfterCaret @@ -166,6 +167,8 @@ extension ParseError: CustomStringConvertible { return "unknown group kind '(\(str)'" case let .unknownCalloutKind(str): return "unknown callout kind '\(str)'" + case let .unknownTextSegmentMatchingOption(m): + return "unknown text segment mode '\(m)'; expected 'w' or 'g'" case let .invalidMatchingOption(c): return "invalid matching option '\(c)'" case .cannotRemoveMatchingOptionsAfterCaret: diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 05f066ff6..2168dbb03 100644 --- 
a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -812,18 +812,28 @@ extension Parser { case "S": return .asciiOnlySpace case "W": return .asciiOnlyWord case "y": - p.expect("{") + // Default to grapheme cluster if unknown. + let recoveryMode = OptKind.textSegmentGraphemeMode + guard p.expect("{") else { return recoveryMode } + + guard let optChar = p.tryEatWithLoc(), optChar.value != "}" else { + p.errorAtCurrentPosition(.expected("text segment mode")) + return recoveryMode + } let opt: OptKind - if p.tryEat("w") { + switch optChar.value { + case "w": opt = .textSegmentWordMode - } else { - p.expect("g") + case "g": opt = .textSegmentGraphemeMode + case let x: + p.error(.unknownTextSegmentMatchingOption(x), at: optChar.location) + opt = recoveryMode } p.expect("}") return opt - // Swift semantic level options + // Swift semantic level options case "X": return .graphemeClusterSemantics case "u": return .unicodeScalarSemantics case "b": return .byteSemantics @@ -958,6 +968,8 @@ extension Parser { } guard let str = p.tryEatPrefix(\.isWordCharacter) else { p.error(.identifierMustBeAlphaNumeric(kind), at: firstChar.location) + // Try skip ahead to the closing delimiter for better recovery. + _ = p.lexUntil { $0.src.isEmpty || $0.src.starts(with: ending) } return "" } return str.value diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 960a7214a..51654c057 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2802,11 +2802,10 @@ extension RegexTests { diagnosticTest(#"(?^"#, .expected(")")) diagnosticTest(#"(?^i"#, .expected(")")) - // TODO: These errors could be better. 
- diagnosticTest(#"(?y)"#, .expected("{"), .expected("g"), .expected("}"), unsupported: true) - diagnosticTest(#"(?y{)"#, .expected("g"), .expected("}"), unsupported: true) + diagnosticTest(#"(?y)"#, .expected("{"), unsupported: true) + diagnosticTest(#"(?y{)"#, .unknownTextSegmentMatchingOption(")"), .expected("}"), .expected(")"), unsupported: true) diagnosticTest(#"(?y{g)"#, .expected("}"), unsupported: true) - diagnosticTest(#"(?y{x})"#, .expected("g"), .expected("}"), .invalidMatchingOption("}"), unsupported: true) + diagnosticTest(#"(?y{x})"#, .unknownTextSegmentMatchingOption("x"), unsupported: true) diagnosticTest(#"(?P"#, .expected(")")) diagnosticTest(#"(?R"#, .expected(")"), unsupported: true) @@ -3086,18 +3085,17 @@ extension RegexTests { diagnosticTest(#"(?k)"#, .unknownGroupKind("?k")) diagnosticTest(#"(?P#)"#, .invalidMatchingOption("#")) - // TODO: We shouldn't emit the expected closing delimiter here and elsewhere. - diagnosticTest(#"(?<#>)"#, .expected(">"), .identifierMustBeAlphaNumeric(.groupName)) + diagnosticTest(#"(?<#>)"#, .identifierMustBeAlphaNumeric(.groupName)) diagnosticTest(#"(?'1A')"#, .identifierCannotStartWithNumber(.groupName)) // TODO: It might be better if tried to consume up to the closing `'` and // diagnosed an invalid group name based on that. diagnosticTest(#"(?'abc ')"#, .expected("'")) - diagnosticTest("(?'🔥')", .identifierMustBeAlphaNumeric(.groupName), .expected("'")) + diagnosticTest("(?'🔥')", .identifierMustBeAlphaNumeric(.groupName)) diagnosticTest(#"(?'-')"#, .expectedIdentifier(.groupName), unsupported: true) - diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName), .expected("'"), unsupported: true) + diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName), unsupported: true) diagnosticTest(#"(?'a-b-c')"#, .expected("'"), unsupported: true) diagnosticTest("(?x)(? : )", .unknownGroupKind("? 
")) @@ -3178,12 +3176,12 @@ extension RegexTests { diagnosticTest(#"\g{0}"#, .cannotReferToWholePattern) diagnosticTest(#"(?(0))"#, .cannotReferToWholePattern, unsupported: true) - diagnosticTest(#"(?&&)"#, .identifierMustBeAlphaNumeric(.groupName), .unbalancedEndOfGroup, .expected(")"), unsupported: true) - diagnosticTest(#"(?&-1)"#, .identifierMustBeAlphaNumeric(.groupName), .unbalancedEndOfGroup, .expected(")"), unsupported: true) - diagnosticTest(#"(?P>+1)"#, .identifierMustBeAlphaNumeric(.groupName), .unbalancedEndOfGroup, .expected(")"), unsupported: true) - diagnosticTest(#"(?P=+1)"#, .identifierMustBeAlphaNumeric(.groupName), .unbalancedEndOfGroup, .expected(")"), unsupported: true) - diagnosticTest(#"\k'#'"#, .identifierMustBeAlphaNumeric(.groupName), .expected("'"), unsupported: true) - diagnosticTest(#"(?&#)"#, .identifierMustBeAlphaNumeric(.groupName), .unbalancedEndOfGroup, .expected(")"), unsupported: true) + diagnosticTest(#"(?&&)"#, .identifierMustBeAlphaNumeric(.groupName), unsupported: true) + diagnosticTest(#"(?&-1)"#, .identifierMustBeAlphaNumeric(.groupName), unsupported: true) + diagnosticTest(#"(?P>+1)"#, .identifierMustBeAlphaNumeric(.groupName), unsupported: true) + diagnosticTest(#"(?P=+1)"#, .identifierMustBeAlphaNumeric(.groupName), unsupported: true) + diagnosticTest(#"\k'#'"#, .identifierMustBeAlphaNumeric(.groupName), unsupported: true) + diagnosticTest(#"(?&#)"#, .identifierMustBeAlphaNumeric(.groupName), unsupported: true) diagnosticTest(#"(?P>1)"#, .identifierCannotStartWithNumber(.groupName), unsupported: true) diagnosticTest(#"\k{1}"#, .identifierCannotStartWithNumber(.groupName), .invalidNamedReference("1")) From 773dfeec483b0f7ea1f3e14f5acd6ae4acb457bb Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 29 Jun 2022 11:15:02 +0100 Subject: [PATCH 6/6] Drop the ASTStage parameter We now always run validation, which is fine because the resulting AST can still be returned. 
--- .../PatternConverter/PatternConverter.swift | 3 +- .../Regex/Parse/CompilerInterface.swift | 2 +- Sources/_RegexParser/Regex/Parse/Parse.swift | 29 ++++--------------- Sources/_StringProcessing/Compiler.swift | 2 +- .../Regex/AnyRegexOutput.swift | 6 ++-- Sources/_StringProcessing/Regex/Core.swift | 4 +-- Tests/RegexTests/CaptureTests.swift | 2 +- Tests/RegexTests/DiagnosticTests.swift | 8 ++--- Tests/RegexTests/RenderDSLTests.swift | 2 +- 9 files changed, 19 insertions(+), 39 deletions(-) diff --git a/Sources/PatternConverter/PatternConverter.swift b/Sources/PatternConverter/PatternConverter.swift index 57e2f31dd..420a92752 100644 --- a/Sources/PatternConverter/PatternConverter.swift +++ b/Sources/PatternConverter/PatternConverter.swift @@ -50,8 +50,7 @@ struct PatternConverter: ParsableCommand { print("Converting '\(delim)\(regex)\(delim)'") let ast = try _RegexParser.parse( - regex, .semantic, - experimentalSyntax ? .experimental : .traditional) + regex, experimentalSyntax ? .experimental : .traditional) // Show rendered source ranges if renderSourceRanges { diff --git a/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift b/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift index 4ae518dcd..0856361d8 100644 --- a/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift +++ b/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift @@ -96,7 +96,7 @@ public func swiftCompilerParseRegexLiteral( _ input: String, captureBufferOut: UnsafeMutableRawBufferPointer ) throws -> (regexToEmit: String, version: Int) { do { - let ast = try parseWithDelimiters(input, .semantic) + let ast = try parseWithDelimiters(input) // Serialize the capture structure for later type inference. 
assert(captureBufferOut.count >= input.utf8.count) ast.captureStructure.encode(to: captureBufferOut) diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 0011390c7..0aae031d5 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -636,37 +636,20 @@ extension Parser { } } -public enum ASTStage { - /// The regex is parsed, and a syntactically valid AST is returned. Otherwise - /// an error is thrown. This is useful for e.g syntax coloring. - case syntactic - - /// The regex is parsed, and a syntactically and semantically valid AST is - /// returned. Otherwise an error is thrown. A semantically valid AST has been - /// checked for e.g unsupported constructs and invalid backreferences. - case semantic -} - public func parseWithRecovery( - _ regex: S, _ syntax: SyntaxOptions, stage: ASTStage = .semantic + _ regex: S, _ syntax: SyntaxOptions ) -> AST where S.SubSequence == Substring { let source = Source(String(regex)) var parser = Parser(source, syntax: syntax) - let ast = parser.parse() - switch stage { - case .syntactic: - return ast - case .semantic: - return validate(ast) - } + return validate(parser.parse()) } public func parse( - _ regex: S, _ stage: ASTStage, _ syntax: SyntaxOptions + _ regex: S, _ syntax: SyntaxOptions ) throws -> AST where S.SubSequence == Substring { - try parseWithRecovery(regex, syntax, stage: stage).ensureValid() + try parseWithRecovery(regex, syntax).ensureValid() } extension StringProtocol { @@ -709,12 +692,12 @@ public func parseWithDelimitersWithRecovery( /// Parses a given regex string with delimiters, inferring the syntax options /// from the delimiters used. 
public func parseWithDelimiters( - _ regex: S, _ stage: ASTStage + _ regex: S ) throws -> AST where S.SubSequence == Substring { let (contents, delim) = droppingRegexDelimiters(String(regex)) let syntax = defaultSyntaxOptions(delim, contents: contents) do { - return try parseWithRecovery(contents, syntax, stage: stage).ensureValid() + return try parseWithRecovery(contents, syntax).ensureValid() } catch let error as LocatedErrorProtocol { // Convert the range in 'contents' to the range in 'regex'. let delimCount = delim.opening.count diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index c834aa95e..f47898e4e 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -62,7 +62,7 @@ func _compileRegex( _ syntax: SyntaxOptions = .traditional, _ semanticLevel: RegexSemanticLevel? = nil ) throws -> Executor { - let ast = try parse(regex, .semantic, syntax) + let ast = try parse(regex, syntax) let dsl: DSLTree switch semanticLevel?.base { diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index 8f9d0e010..20731ad39 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -143,11 +143,11 @@ extension Regex where Output == AnyRegexOutput { /// /// - Parameter pattern: The regular expression. 
public init(_ pattern: String) throws { - self.init(ast: try parse(pattern, .semantic, .traditional)) + self.init(ast: try parse(pattern, .traditional)) } internal init(_ pattern: String, syntax: SyntaxOptions) throws { - self.init(ast: try parse(pattern, .semantic, syntax)) + self.init(ast: try parse(pattern, syntax)) } } @@ -161,7 +161,7 @@ extension Regex { _ pattern: String, as: Output.Type = Output.self ) throws { - self.init(ast: try parse(pattern, .semantic, .traditional)) + self.init(ast: try parse(pattern, .traditional)) } /// Produces a regex that matches `verbatim` exactly, as though every diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index b27095f3f..0afe11c77 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -44,7 +44,7 @@ public struct Regex: RegexComponent { // Compiler interface. Do not change independently. @usableFromInline init(_regexString pattern: String) { - self.init(ast: try! parse(pattern, .semantic, .traditional)) + self.init(ast: try! parse(pattern, .traditional)) } // Compiler interface. Do not change independently. @@ -53,7 +53,7 @@ public struct Regex: RegexComponent { assert(version == currentRegexLiteralFormatVersion) // The version argument is passed by the compiler using the value defined // in libswiftParseRegexLiteral. - self.init(ast: try! parseWithDelimiters(pattern, .semantic)) + self.init(ast: try! parseWithDelimiters(pattern)) } public var regex: Regex { diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index d72d9b10a..e46aae409 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -158,7 +158,7 @@ func captureTest( file: StaticString = #file, line: UInt = #line ) { - let ast = try! parse(regex, .semantic, .traditional) + let ast = try! 
parse(regex, .traditional) var capList = ast.captureList.withoutLocs // Peel off the whole match element. capList.captures.removeFirst() diff --git a/Tests/RegexTests/DiagnosticTests.swift b/Tests/RegexTests/DiagnosticTests.swift index 0100a3a86..1a3606bf5 100644 --- a/Tests/RegexTests/DiagnosticTests.swift +++ b/Tests/RegexTests/DiagnosticTests.swift @@ -20,7 +20,7 @@ extension RegexTests { XCTAssert(SourceLocation.fake.isFake) XCTAssert(group(.capture, "a").location.isFake) - let ast = try! parse("(a)", .semantic, .traditional).root + let ast = try! parse("(a)", .traditional).root XCTAssert(ast.location.isReal) } @@ -31,7 +31,7 @@ extension RegexTests { // // Input should be a concatenation or alternation func flatTest(_ str: String, _ expected: [String]) { - guard let ast = try? parse(str, .semantic, .traditional).root else { + guard let ast = try? parse(str, .traditional).root else { XCTFail("Fail to parse: \(str)") return } @@ -53,9 +53,7 @@ extension RegexTests { flatTest("a|(b)|", ["a", "(b)", ""]) func renderTest(_ str: String, _ expected: [String]) { - let lines = try! parse( - str, .semantic, .traditional - )._render(in: str) + let lines = try! parse(str, .traditional)._render(in: str) func fail() { XCTFail(""" expected: diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 7bf8ba412..97ba3e333 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -21,7 +21,7 @@ func testConversion( _ expectedDSL: String, file: StaticString = #file, line: UInt = #line ) throws { - let ast = try _RegexParser.parse(regex, .semantic, .traditional) + let ast = try _RegexParser.parse(regex, .traditional) let actualDSL = renderAsBuilderDSL(ast: ast)._trimmingSuffix(while: \.isWhitespace) XCTAssertEqual(actualDSL, expectedDSL[...], file: file, line: line) }