diff --git a/Sources/PatternConverter/PatternConverter.swift b/Sources/PatternConverter/PatternConverter.swift index 57e2f31dd..420a92752 100644 --- a/Sources/PatternConverter/PatternConverter.swift +++ b/Sources/PatternConverter/PatternConverter.swift @@ -50,8 +50,7 @@ struct PatternConverter: ParsableCommand { print("Converting '\(delim)\(regex)\(delim)'") let ast = try _RegexParser.parse( - regex, .semantic, - experimentalSyntax ? .experimental : .traditional) + regex, experimentalSyntax ? .experimental : .traditional) // Show rendered source ranges if renderSourceRanges { diff --git a/Sources/_RegexParser/Regex/AST/AST.swift b/Sources/_RegexParser/Regex/AST/AST.swift index 44bc10828..43bb460c3 100644 --- a/Sources/_RegexParser/Regex/AST/AST.swift +++ b/Sources/_RegexParser/Regex/AST/AST.swift @@ -15,16 +15,31 @@ public struct AST: Hashable { public var root: AST.Node public var globalOptions: GlobalMatchingOptionSequence? + public var diags: Diagnostics - public init(_ root: AST.Node, globalOptions: GlobalMatchingOptionSequence?) { + public init( + _ root: AST.Node, globalOptions: GlobalMatchingOptionSequence?, + diags: Diagnostics + ) { self.root = root self.globalOptions = globalOptions + self.diags = diags } } extension AST { /// Whether this AST tree contains at least one capture nested inside of it. public var hasCapture: Bool { root.hasCapture } + + /// Whether this AST tree is either syntactically or semantically invalid. + public var isInvalid: Bool { diags.hasAnyError } + + /// If the AST is invalid, throws an error. Otherwise, returns self. + @discardableResult + public func ensureValid() throws -> AST { + try diags.throwAnyError() + return self + } } extension AST { @@ -265,12 +280,12 @@ extension AST { public enum Kind: Hashable { // \n \gn \g{n} \g \g'n' (?n) (?(n)... // Oniguruma: \k, \k'n' - case absolute(Int) + case absolute(AST.Atom.Number) // \g{-n} \g<+n> \g'+n' \g<-n> \g'-n' (?+n) (?-n) // (?(+n)... (?(-n)... 
// Oniguruma: \k<-n> \k<+n> \k'-n' \k'+n' - case relative(Int) + case relative(AST.Atom.Number) // \k \k'name' \g{name} \k{name} (?P=name) // \g \g'name' (?&name) (?P>name) @@ -278,20 +293,33 @@ extension AST { case named(String) /// (?R), (?(R)..., which are equivalent to (?0), (?(0)... - static var recurseWholePattern: Kind { .absolute(0) } + static func recurseWholePattern(_ loc: SourceLocation) -> Kind { + .absolute(.init(0, at: loc)) + } + + /// Whether this is a reference that recurses the whole pattern, rather + /// than a group. + public var recursesWholePattern: Bool { + switch self { + case .absolute(let a): + return a.value == 0 + default: + return false + } + } } public var kind: Kind /// An additional specifier supported by Oniguruma that specifies what /// recursion level the group being referenced belongs to. - public var recursionLevel: Located? + public var recursionLevel: AST.Atom.Number? /// The location of the inner numeric or textual reference, e.g the location /// of '-2' in '\g{-2}'. Note this includes the recursion level for e.g /// '\k'. public var innerLoc: SourceLocation - public init(_ kind: Kind, recursionLevel: Located? = nil, + public init(_ kind: Kind, recursionLevel: AST.Atom.Number? = nil, innerLoc: SourceLocation) { self.kind = kind self.recursionLevel = recursionLevel @@ -300,7 +328,7 @@ extension AST { /// Whether this is a reference that recurses the whole pattern, rather than /// a group. - public var recursesWholePattern: Bool { kind == .recurseWholePattern } + public var recursesWholePattern: Bool { kind.recursesWholePattern } } /// A set of global matching options in a regular expression literal. diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index a349c2a85..f1419ad78 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -80,6 +80,9 @@ extension AST { // (?i), (?i-m), ... 
case changeMatchingOptions(MatchingOptionSequence) + + // An invalid atom created by a parse error. + case invalid } } } @@ -104,6 +107,7 @@ extension AST.Atom { case .any: return nil case .startOfLine: return nil case .endOfLine: return nil + case .invalid: return nil } } @@ -113,6 +117,18 @@ extension AST.Atom { } extension AST.Atom { + public struct Number: Hashable { + /// The value, which may be `nil` in an invalid AST, e.g the parser expected + /// a number at a given location, or the parsed number overflowed. + public var value: Int? + public var location: SourceLocation + + public init(_ value: Int?, at location: SourceLocation) { + self.value = value + self.location = location + } + } + public struct Scalar: Hashable { public var value: UnicodeScalar public var location: SourceLocation @@ -453,6 +469,9 @@ extension AST.Atom.CharacterProperty { /// Some special properties implemented by Java. case javaSpecial(JavaSpecial) + /// An invalid property that has been diagnosed by the parser. + case invalid(key: String?, value: String) + public enum MapKind: Hashable { case lowercase case uppercase @@ -558,7 +577,7 @@ extension AST.Atom { /// A PCRE callout written `(?C...)` public struct PCRE: Hashable { public enum Argument: Hashable { - case number(Int) + case number(AST.Atom.Number) case string(String) } public var arg: AST.Located @@ -789,7 +808,7 @@ extension AST.Atom { case .scalarSequence, .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern, .callout, .backtrackingDirective, - .changeMatchingOptions: + .changeMatchingOptions, .invalid: return nil } } @@ -803,6 +822,10 @@ extension AST.Atom { // \cx, \C-x, \M-x, \M-\C-x, \N{...} case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter: return true + case .scalarSequence: + // Unsupported for now (and we will diagnose as such), but treat it as a + // valid range operand for better recovery. 
+ return true default: return false } @@ -837,7 +860,7 @@ extension AST.Atom { case .property, .escaped, .any, .startOfLine, .endOfLine, .backreference, .subpattern, .namedCharacter, .callout, - .backtrackingDirective, .changeMatchingOptions: + .backtrackingDirective, .changeMatchingOptions, .invalid: return nil } } diff --git a/Sources/_RegexParser/Regex/AST/Conditional.swift b/Sources/_RegexParser/Regex/AST/Conditional.swift index c382a25b6..3a9a43be8 100644 --- a/Sources/_RegexParser/Regex/AST/Conditional.swift +++ b/Sources/_RegexParser/Regex/AST/Conditional.swift @@ -66,11 +66,13 @@ extension AST.Conditional { extension AST.Conditional.Condition { public struct PCREVersionNumber: Hashable { - public var major: Int - public var minor: Int + public var major: AST.Atom.Number + public var minor: AST.Atom.Number public var location: SourceLocation - public init(major: Int, minor: Int, _ location: SourceLocation) { + public init( + major: AST.Atom.Number, minor: AST.Atom.Number, _ location: SourceLocation + ) { self.major = major self.minor = minor self.location = location diff --git a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift index db813b407..c85c2b3d1 100644 --- a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift +++ b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift @@ -175,13 +175,13 @@ extension AST { } public enum Kind: Hashable { /// (*LIMIT_DEPTH=d) - case limitDepth(Located) + case limitDepth(AST.Atom.Number) /// (*LIMIT_HEAP=d) - case limitHeap(Located) + case limitHeap(AST.Atom.Number) /// (*LIMIT_MATCH=d) - case limitMatch(Located) + case limitMatch(AST.Atom.Number) /// (*NOTEMPTY) case notEmpty diff --git a/Sources/_RegexParser/Regex/AST/Quantification.swift b/Sources/_RegexParser/Regex/AST/Quantification.swift index c6d4f0101..7bc2e6620 100644 --- a/Sources/_RegexParser/Regex/AST/Quantification.swift +++ b/Sources/_RegexParser/Regex/AST/Quantification.swift @@ -37,13 +37,13 @@ 
extension AST { } public enum Amount: Hashable { - case zeroOrMore // * - case oneOrMore // + - case zeroOrOne // ? - case exactly(Located) // {n} - case nOrMore(Located) // {n,} - case upToN(Located) // {,n} - case range(Located, Located) // {n,m} + case zeroOrMore // * + case oneOrMore // + + case zeroOrOne // ? + case exactly(AST.Atom.Number) // {n} + case nOrMore(AST.Atom.Number) // {n,} + case upToN(AST.Atom.Number) // {,n} + case range(AST.Atom.Number, AST.Atom.Number) // {n,m} } public enum Kind: String, Hashable { @@ -58,7 +58,7 @@ extension AST { extension AST.Quantification.Amount { /// The bounds. - public var bounds: (atLeast: Int, atMost: Int?) { + public var bounds: (atLeast: Int?, atMost: Int?) { switch self { case .zeroOrMore: return (0, nil) case .oneOrMore: return (1, nil) diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index fb122e027..bd635c83f 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -9,12 +9,12 @@ // //===----------------------------------------------------------------------===// -extension Source { +extension Parser { typealias PropertyKind = AST.Atom.CharacterProperty.Kind static private func withNormalizedForms( - _ str: String, requireInPrefix: Bool = false, match: (String) throws -> T? - ) rethrows -> T? { + _ str: String, requireInPrefix: Bool = false, match: (String) -> T? + ) -> T? { // This follows the rules provided by UAX44-LM3, including trying to drop an // "is" prefix, which isn't required by UTS#18 RL1.2, but is nice for // consistency with other engines and the Unicode.Scalar.Properties names. 
@@ -22,12 +22,12 @@ extension Source { .lowercased() if requireInPrefix { guard str.hasPrefix("in") else { return nil } - return try match(String(str.dropFirst(2))) + return match(String(str.dropFirst(2))) } - if let m = try match(str) { + if let m = match(str) { return m } - if str.hasPrefix("is"), let m = try match(String(str.dropFirst(2))) { + if str.hasPrefix("is"), let m = match(String(str.dropFirst(2))) { return m } return nil @@ -736,31 +736,40 @@ extension Source { return (major, minor) } - static func classifyCharacterPropertyValueOnly( - _ value: String - ) throws -> PropertyKind { - guard !value.isEmpty else { throw ParseError.emptyProperty } + mutating func classifyCharacterPropertyValueOnly( + _ valueLoc: Located + ) -> PropertyKind { + let value = valueLoc.value + + func error(_ err: ParseError) -> PropertyKind { + self.error(err, at: valueLoc.location) + return .invalid(key: nil, value: value) + } + + guard !value.isEmpty else { + return error(.emptyProperty) + } // Some special cases defined by UTS#18 (and Oniguruma for 'ANY' and // 'Assigned'). - if let specialProp = classifySpecialPropValue(value) { + if let specialProp = Self.classifySpecialPropValue(value) { return specialProp } // The following properties we can infer keys/values for. 
- if let prop = classifyBoolProperty(value) { + if let prop = Self.classifyBoolProperty(value) { return .binary(prop, value: true) } - if let cat = classifyGeneralCategory(value) { + if let cat = Self.classifyGeneralCategory(value) { return .generalCategory(cat) } - if let script = classifyScriptProperty(value) { + if let script = Self.classifyScriptProperty(value) { return .scriptExtension(script) } - if let posix = classifyPOSIX(value) { + if let posix = Self.classifyPOSIX(value) { return .posix(posix) } - if let block = classifyBlockProperty(value, valueOnly: true) { + if let block = Self.classifyBlockProperty(value, valueOnly: true) { return .block(block) } @@ -776,53 +785,67 @@ extension Source { // TODO: This should be versioned, and do we want a more lax behavior for // the runtime? - throw ParseError.unknownProperty(key: nil, value: value) + return error(.unknownProperty(key: nil, value: value)) } - static func classifyCharacterProperty( - key: String, value: String - ) throws -> PropertyKind { - guard !key.isEmpty && !value.isEmpty else { throw ParseError.emptyProperty } + mutating func classifyCharacterProperty( + key keyLoc: Located, value valueLoc: Located + ) -> PropertyKind { + let key = keyLoc.value + let value = valueLoc.value + + func valueError(_ err: ParseError) -> PropertyKind { + error(err, at: valueLoc.location) + return .invalid(key: key, value: value) + } + + guard !key.isEmpty else { + error(.emptyProperty, at: keyLoc.location) + return .invalid(key: key, value: value) + } + guard !value.isEmpty else { + return valueError(.emptyProperty) + } - if let prop = classifyBoolProperty(key), - let isTrue = classifyCharacterPropertyBoolValue(value) { + if let prop = Self.classifyBoolProperty(key), + let isTrue = Self.classifyCharacterPropertyBoolValue(value) { return .binary(prop, value: isTrue) } // This uses the aliases defined in // https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt. 
- let match = try withNormalizedForms(key) { normalizedKey -> PropertyKind? in + let match = Self.withNormalizedForms(key) { normalizedKey -> PropertyKind? in switch normalizedKey { case "script", "sc": - guard let script = classifyScriptProperty(value) else { - throw ParseError.unrecognizedScript(value) + guard let script = Self.classifyScriptProperty(value) else { + return valueError(.unrecognizedScript(value)) } return .script(script) case "scriptextensions", "scx": - guard let script = classifyScriptProperty(value) else { - throw ParseError.unrecognizedScript(value) + guard let script = Self.classifyScriptProperty(value) else { + return valueError(.unrecognizedScript(value)) } return .scriptExtension(script) case "gc", "generalcategory": - guard let cat = classifyGeneralCategory(value) else { - throw ParseError.unrecognizedCategory(value) + guard let cat = Self.classifyGeneralCategory(value) else { + return valueError(.unrecognizedCategory(value)) } return .generalCategory(cat) case "age": - guard let (major, minor) = parseAge(value) else { - throw ParseError.invalidAge(value) + guard let (major, minor) = Self.parseAge(value) else { + return valueError(.invalidAge(value)) } return .age(major: major, minor: minor) case "name", "na": return .named(value) case "numericvalue", "nv": guard let numericValue = Double(value) else { - throw ParseError.invalidNumericValue(value) + return valueError(.invalidNumericValue(value)) } return .numericValue(numericValue) case "numerictype", "nt": - guard let type = classifyNumericType(value) else { - throw ParseError.unrecognizedNumericType(value) + guard let type = Self.classifyNumericType(value) else { + return valueError(.unrecognizedNumericType(value)) } return .numericType(type) case "slc", "simplelowercasemapping": @@ -833,13 +856,13 @@ extension Source { return .mapping(.titlecase, value) case "ccc", "canonicalcombiningclass": guard let cccValue = UInt8(value), cccValue <= 254 else { - throw ParseError.invalidCCC(value) + 
return valueError(.invalidCCC(value)) } return .ccc(.init(rawValue: cccValue)) case "blk", "block": - guard let block = classifyBlockProperty(value, valueOnly: false) else { - throw ParseError.unrecognizedBlock(value) + guard let block = Self.classifyBlockProperty(value, valueOnly: false) else { + return valueError(.unrecognizedBlock(value)) } return .block(block) default: @@ -852,6 +875,8 @@ extension Source { } // TODO: This should be versioned, and do we want a more lax behavior for // the runtime? - throw ParseError.unknownProperty(key: key, value: value) + error(.unknownProperty(key: key, value: value), + at: keyLoc.location.union(with: valueLoc.location)) + return .invalid(key: key, value: value) } } diff --git a/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift b/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift index 4ae518dcd..0856361d8 100644 --- a/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift +++ b/Sources/_RegexParser/Regex/Parse/CompilerInterface.swift @@ -96,7 +96,7 @@ public func swiftCompilerParseRegexLiteral( _ input: String, captureBufferOut: UnsafeMutableRawBufferPointer ) throws -> (regexToEmit: String, version: Int) { do { - let ast = try parseWithDelimiters(input, .semantic) + let ast = try parseWithDelimiters(input) // Serialize the capture structure for later type inference. 
assert(captureBufferOut.count >= input.utf8.count) ast.captureStructure.encode(to: captureBufferOut) diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index 5bca2ad13..f5c0d7075 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -55,6 +55,7 @@ enum ParseError: Error, Hashable { case unknownGroupKind(String) case unknownCalloutKind(String) + case unknownTextSegmentMatchingOption(Character) case invalidMatchingOption(Character) case cannotRemoveMatchingOptionsAfterCaret @@ -166,6 +167,8 @@ extension ParseError: CustomStringConvertible { return "unknown group kind '(\(str)'" case let .unknownCalloutKind(str): return "unknown callout kind '\(str)'" + case let .unknownTextSegmentMatchingOption(m): + return "unknown text segment mode '\(m)'; expected 'w' or 'g'" case let .invalidMatchingOption(c): return "invalid matching option '\(c)'" case .cannotRemoveMatchingOptionsAfterCaret: @@ -179,7 +182,7 @@ extension ParseError: CustomStringConvertible { case .unsupportedDotNetSubtraction: return "subtraction with '-' is unsupported; use '--' instead" case .emptyProperty: - return "empty property" + return "expected property name" case .unknownProperty(let key, let value): if let key = key { return "unknown character property '\(key)=\(value)'" @@ -242,9 +245,128 @@ extension ParseError: CustomStringConvertible { } } -// TODO: Fixits, notes, etc. +/// A fatal error that indicates broken logic in the parser. +enum FatalParseError: Hashable, Error { + case unreachable(String) +} + +extension FatalParseError: CustomStringConvertible { + var description: String { + switch self { + case .unreachable(let str): + return "UNREACHABLE: \(str)" + } + } +} + +// MARK: Diagnostic handling + +/// A diagnostic to emit. 
+public struct Diagnostic: Hashable { + public let behavior: Behavior + public let message: String + public let location: SourceLocation + + // TODO: Fixits, notes, etc. + + // The underlying ParseError if applicable. This is used for testing. + internal let underlyingParseError: ParseError? + + init(_ behavior: Behavior, _ message: String, at loc: SourceLocation, + underlyingParseError: ParseError? = nil) { + self.behavior = behavior + self.message = message + self.location = loc + self.underlyingParseError = underlyingParseError + } + + public var isAnyError: Bool { behavior.isAnyError } +} + +extension Diagnostic { + public enum Behavior: Hashable { + case fatalError, error, warning + + public var isAnyError: Bool { + switch self { + case .fatalError, .error: + return true + case .warning: + return false + } + } + } +} -// TODO: Diagnostics engine, recorder, logger, or similar. +/// A collection of diagnostics to emit. +public struct Diagnostics: Hashable { + public private(set) var diags = [Diagnostic]() + public init() {} + public init(_ diags: [Diagnostic]) { + self.diags = diags + } + /// Add a new diagnostic to emit. + public mutating func append(_ diag: Diagnostic) { + diags.append(diag) + } + /// Add all the diagnostics of another diagnostic collection. + public mutating func append(contentsOf other: Diagnostics) { + diags.append(contentsOf: other.diags) + } + + /// Add all the new fatal error diagnostics of another diagnostic collection. + /// This assumes that `other` was the same as `self`, but may have additional + /// diagnostics added to it. + public mutating func appendNewFatalErrors(from other: Diagnostics) { + let newDiags = other.diags.dropFirst(diags.count) + for diag in newDiags where diag.behavior == .fatalError { + append(diag) + } + } + + /// Whether any error is present. This includes fatal errors. + public var hasAnyError: Bool { + diags.contains(where: { $0.isAnyError }) + } + + /// Whether any fatal error is present. 
+ public var hasFatalError: Bool { + diags.contains(where: { $0.behavior == .fatalError }) + } + + /// If any error diagnostic has been added, throw it as an Error. + func throwAnyError() throws { + for diag in diags where diag.isAnyError { + struct ErrorDiagnostic: Error, CustomStringConvertible { + var diag: Diagnostic + var description: String { diag.message } + } + throw Source.LocatedError(ErrorDiagnostic(diag: diag), diag.location) + } + } +} + +// MARK: Diagnostic construction + +extension Diagnostic { + init(_ err: ParseError, at loc: SourceLocation) { + self.init(.error, "\(err)", at: loc, underlyingParseError: err) + } + + init(_ err: FatalParseError, at loc: SourceLocation) { + self.init(.fatalError, "\(err)", at: loc) + } +} + +extension Diagnostics { + mutating func error(_ err: ParseError, at loc: SourceLocation) { + append(Diagnostic(err, at: loc)) + } + + mutating func fatal(_ err: FatalParseError, at loc: SourceLocation) { + append(Diagnostic(err, at: loc)) + } +} diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index be6f13fc7..2168dbb03 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -16,178 +16,244 @@ concerns upon request. API convention: -- lexFoo will try to consume a foo and return it if successful, throws errors -- expectFoo will consume a foo, throwing errors, and throw an error if it can't -- eat() and tryEat() is still used by the parser as a character-by-character interface +- lexFoo will try to consume a foo and return it if successful, otherwise returns nil +- expectFoo will consume a foo, diagnosing an error if unsuccessful */ -extension Error { - func addingLocation(_ loc: Range) -> Error { - // If we're already a LocatedError, don't change the location. 
- if self is LocatedErrorProtocol { - return self - } - return Source.LocatedError(self, loc) - } -} +extension Parser { + typealias Located = Source.Located + typealias Location = Source.Location + typealias LocatedError = Source.LocatedError + typealias Char = Source.Char -extension Source { // MARK: - recordLoc - /// Record source loc before processing and return - /// or throw the value/error with source locations. + /// Attach a source location to the parsed contents of a given function. fileprivate mutating func recordLoc( - _ f: (inout Self) throws -> T - ) rethrows -> Located { - let start = currentPosition - do { - let result = try f(&self) - return Located(result, Location(start.. { - throw e - } catch let e as ParseError { - throw LocatedError(e, Location(start.. T + ) -> Located { + let start = src.currentPosition + let result = f(&self) + return Located(result, loc(start)) + } + + /// Attach a source location to the parsed contents of a given function. fileprivate mutating func recordLoc( - _ f: (inout Self) throws -> T? - ) rethrows -> Located? { - let start = currentPosition - do { - guard let result = try f(&self) else { return nil } - return Located(result, start.. T? + ) -> Located? { + let start = src.currentPosition + guard let result = f(&self) else { return nil } + return Located(result, loc(start)) } - /// Record source loc before processing and return - /// or throw the value/error with source locations. + /// Attach a source location to the parsed contents of a given function. @discardableResult fileprivate mutating func recordLoc( - _ f: (inout Self) throws -> () - ) rethrows -> SourceLocation { - let start = currentPosition - do { - try f(&self) - return SourceLocation(start.. { - throw e - } catch let e as ParseError { - throw LocatedError(e, start.. 
() + ) -> SourceLocation { + let start = src.currentPosition + f(&self) + return loc(start) + } +} + +// MARK: Backtracking routines + +extension Parser { + /// Attempt to make a series of lexing steps in `body`, returning `nil` if + /// unsuccesful, which will revert the parser back to its previous state. + mutating func tryEating( + _ body: (inout Self) -> T? + ) -> T? { + var current = self + guard let result = body(&self) else { + // Fatal errors are always preserved. + current.diags.appendNewFatalErrors(from: diags) + self = current + return nil } + return result + } + + /// Perform a lookahead using a temporary source. Within the body of the + /// lookahead, any modifications to the source will not be reflected outside + /// the body. + mutating func lookahead(_ body: (inout Self) -> T) -> T { + var p = self + let result = body(&p) + // Fatal errors are always preserved. + diags.appendNewFatalErrors(from: p.diags) + return result } } // MARK: - Consumption routines -extension Source { +extension Parser { typealias Quant = AST.Quantification - /// Throws an expected character error if not matched + /// Expect to eat a given character, diagnosing an error and returning + /// `false` if unsuccessful, `true` otherwise. @discardableResult - mutating func expect(_ c: Character) throws -> SourceLocation { - try recordLoc { src in - guard src.tryEat(c) else { - throw ParseError.expected(String(c)) - } + mutating func expect(_ c: Character) -> Bool { + guard tryEat(c) else { + errorAtCurrentPosition(.expected(String(c))) + return false + } + return true + } + + /// Same as `expect`, but with a source location. + mutating func expectWithLoc(_ c: Character) -> Located { + recordLoc { + $0.expect(c) } } - /// Throws an expected character error if not matched + /// Expect to eat a sequence of characters, diagnosing an error and returning + /// `false` if unsuccessful, `true` otherwise. 
+ @discardableResult mutating func expect( sequence c: C - ) throws where C.Element == Character { - _ = try recordLoc { src in - guard src.tryEat(sequence: c) else { - throw ParseError.expected(String(c)) - } + ) -> Bool where C.Element == Character { + guard tryEat(sequence: c) else { + errorAtCurrentPosition(.expected(String(c))) + return false } + return true } - /// Throws an unexpected end of input error if not matched - /// - /// Note: much of the time, but not always, we can vend a more specific error. + /// Diagnoses an error and returns `false` if the end of input has been + /// reached. Otherwise returns `true`. + @discardableResult mutating func expectNonEmpty( _ error: ParseError = .unexpectedEndOfInput - ) throws { - _ = try recordLoc { src in - if src.isEmpty { throw error } + ) -> Bool { + guard !src.isEmpty else { + errorAtCurrentPosition(error) + return false } + return true } - mutating func tryEatNonEmpty(sequence c: C) throws -> Bool - where C.Element == Char - { - try expectNonEmpty(.expected(String(c))) - return tryEat(sequence: c) + /// Attempt to eat a sequence of characters, additionally diagnosing if the + /// end of the source has been reached. + mutating func tryEatNonEmpty( + sequence c: C + ) -> Bool where C.Element == Char { + expectNonEmpty(.expected(String(c))) && tryEat(sequence: c) } - mutating func tryEatNonEmpty(_ c: Char) throws -> Bool { - try tryEatNonEmpty(sequence: String(c)) + /// Returns the next character, or `nil` if the end of the source has been + /// reached. + func peek() -> Char? { src.peek() } + + /// Same as `peek()`, but with the source location of the next character. + func peekWithLoc() -> Located? { + peek().map { c in + let nextPos = src.input.index(after: src.currentPosition) + return Located(c, Location(src.currentPosition ..< nextPos)) + } } - /// Attempt to make a series of lexing steps in `body`, returning `nil` if - /// unsuccesful, which will revert the source back to its previous state. 
If - /// an error is thrown, the source will not be reverted. - mutating func tryEating( - _ body: (inout Source) throws -> T? - ) rethrows -> T? { - // We don't revert the source if an error is thrown, as it's useful to - // maintain the source location in that case. - let current = self - guard let result = try body(&self) else { - self = current - return nil + /// Advance the input `n` characters ahead. + mutating func advance(_ n: Int = 1) { + guard src.tryAdvance(n) else { + unreachable("Advancing beyond end!") + + // Empty out the remaining characters. + src.tryAdvance(src._slice.count) + return } - return result } - /// Perform a lookahead using a temporary source. Within the body of the - /// lookahead, any modifications to the source will not be reflected outside - /// the body. - func lookahead(_ body: (inout Source) throws -> T) rethrows -> T { - var src = self - return try body(&src) + /// Try to eat any character, returning `nil` if the input has been exhausted. + mutating func tryEat() -> Char? { + guard let char = peek() else { return nil } + advance() + return char + } + + /// Same as `tryEat()`, but with the source location of the eaten character. + mutating func tryEatWithLoc() -> Located? { + recordLoc { $0.tryEat() } + } + + /// Attempt to eat the given character, returning `true` if successful, + /// `false` otherwise. + mutating func tryEat(_ c: Char) -> Bool { + guard peek() == c else { return false } + advance() + return true } /// Attempt to eat the given character, returning its source location if /// successful, `nil` otherwise. mutating func tryEatWithLoc(_ c: Character) -> SourceLocation? 
{ - let start = currentPosition + let start = src.currentPosition guard tryEat(c) else { return nil } - return .init(start ..< currentPosition) + return .init(start ..< src.currentPosition) + } + + /// Attempt to eat a character if it matches a given predicate, returning + /// `true` if the character was eaten, or `false` if the character did not + /// meet the predicate. + mutating func tryEat(where pred: (Char) -> Bool) -> Bool { + guard let next = peek(), pred(next) else { return false } + advance() + return true + } + + /// Attempt to eat a sequence of characters, returning `true` if successful. + mutating func tryEat( + sequence c: C + ) -> Bool where C.Element == Char { + guard src.starts(with: c) else { return false } + advance(c.count) + return true + } + + /// Attempt to eat any of the given characters, returning the one that was + /// eaten. + mutating func tryEat( + anyOf set: C + ) -> Char? where C.Element == Char { + guard let c = peek(), set.contains(c) else { return nil } + advance() + return c + } + + /// Attempt to eat any of the given characters, returning the one that was + /// eaten. + mutating func tryEat(anyOf set: Char...) -> Char? { + tryEat(anyOf: set) + } + + /// Eat up to `count` characters, returning the range of characters eaten. + mutating func eat(upToCount count: Int) -> Located { + recordLoc { $0.src.eat(upToCount: count).string } } /// Attempt to eat a given prefix that satisfies a given predicate, with the /// source location recorded. - mutating func tryEatLocatedPrefix( + mutating func tryEatPrefix( maxLength: Int? = nil, _ f: (Char) -> Bool ) -> Located? 
{ - let result = recordLoc { src in - src.tryEatPrefix(maxLength: maxLength, f) - } - guard let result = result else { return nil } - return result.map(\.string) + recordLoc { $0.src.tryEatPrefix(maxLength: maxLength, f)?.string } } - /// Throws an expected ASCII character error if not matched - mutating func expectASCII() throws -> Located { - try recordLoc { src in - guard let c = src.peek() else { - throw ParseError.unexpectedEndOfInput + /// Attempts to eat an ASCII value, diagnosing an error and returning `nil` + /// if unsuccessful. + mutating func expectASCII() -> Located? { + recordLoc { p in + guard let c = p.tryEat() else { + p.errorAtCurrentPosition(.unexpectedEndOfInput) + return nil } guard c.isASCII else { - throw ParseError.expectedASCII(c) + p.errorAtCurrentPosition(.expectedASCII(c)) + return nil } - src.eat(asserting: c) return c } } @@ -218,31 +284,43 @@ enum IdentifierKind { case onigurumaCalloutTag } -extension Source { +extension Parser { /// Validate a string of digits as a particular radix, and return the number, - /// or throw an error if the string is malformed or would overflow the number - /// type. - private static func validateNumber( - _ str: String, _: Num.Type, _ kind: RadixKind - ) throws -> Num { + /// or diagnose an error if the string is malformed or would overflow the + /// number type. + private mutating func validateNumber( + _ locStr: Located, _: Num.Type, _ kind: RadixKind + ) -> Num? 
{ + let str = locStr.value guard !str.isEmpty && str.all(kind.characterFilter) else { - throw ParseError.expectedNumber(str, kind: kind) + error(.expectedNumber(str, kind: kind), at: locStr.location) + return nil } guard let i = Num(str, radix: kind.radix) else { - throw ParseError.numberOverflow(str) + error(.numberOverflow(str), at: locStr.location) + return nil } return i } /// Validate a string of digits as a unicode scalar of a particular radix, and - /// return the scalar value, or throw an error if the string is malformed or - /// would overflow the scalar. - private static func validateUnicodeScalar( + /// return the scalar value, or diagnose an error if the string is malformed + /// or would overflow the scalar. + private mutating func validateUnicodeScalar( _ str: Source.Located<String>, _ kind: RadixKind - ) throws -> AST.Atom.Scalar { - let num = try validateNumber(str.value, UInt32.self, kind) + ) -> AST.Atom.Scalar { + func nullScalar() -> AST.Atom.Scalar { + // For now, return a null scalar in the case of an error. This should be + // benign as it shouldn't affect other validation logic. + // TODO: Should we store nil like we do with regular numbers? + return .init(UnicodeScalar(0), str.location) + } + guard let num = validateNumber(str, UInt32.self, kind) else { + return nullScalar() + } guard let scalar = Unicode.Scalar(num) else { - throw ParseError.misc("Invalid scalar value U+\(num.hexStr)") + error(.misc("Invalid scalar value U+\(num.hexStr)"), at: str.location) + return nullScalar() } return .init(scalar, str.location) } @@ -251,51 +329,39 @@ extension Source { /// /// Returns: `nil` if there's no number, otherwise the number /// - /// Throws on overflow + /// Diagnoses on overflow /// - private mutating func lexNumber<Num: FixedWidthInteger>( - _ ty: Num.Type, _ kind: RadixKind - ) throws -> Located<Num>?
{ - try recordLoc { src in - guard let str = src.tryEatPrefix(kind.characterFilter)?.string else { - return nil - } - guard let i = Num(str, radix: kind.radix) else { - throw ParseError.numberOverflow(str) - } - return i + mutating func lexNumber(_ kind: RadixKind = .decimal) -> AST.Atom.Number? { + guard let str = tryEatPrefix(kind.characterFilter) else { + return nil + } + guard let i = Int(str.value, radix: kind.radix) else { + error(.numberOverflow(str.value), at: str.location) + return .init(nil, at: str.location) + } + return .init(i, at: str.location) } - /// Try to eat a number off the front. - /// - /// Returns: `nil` if there's no number, otherwise the number - /// - /// Throws on overflow - /// - mutating func lexNumber() throws -> Located<Int>? { - try lexNumber(Int.self, .decimal) - } - - mutating func expectNumber() throws -> Located<Int> { - guard let num = try lexNumber() else { - throw ParseError.expectedNumber("", kind: .decimal) + /// Expect a number of a given `kind`, diagnosing if a number cannot be + /// parsed.
+ mutating func expectNumber(_ kind: RadixKind = .decimal) -> AST.Atom.Number { + guard let num = lexNumber(kind) else { + errorAtCurrentPosition(.expectedNumber("", kind: kind)) + return .init(nil, at: loc(src.currentPosition)) } return num } /// Eat a scalar value from hexadecimal notation off the front - private mutating func expectUnicodeScalar( - numDigits: Int - ) throws -> AST.Atom.Scalar { - let str = try recordLoc { src -> String in - let str = src.eat(upToCount: numDigits).string - guard str.count == numDigits else { - throw ParseError.expectedNumDigits(str, numDigits) + mutating func expectUnicodeScalar(numDigits: Int) -> AST.Atom.Scalar { + let str = recordLoc { p -> String in + let str = p.eat(upToCount: numDigits) + if str.value.count != numDigits { + p.error(.expectedNumDigits(str.value, numDigits), at: str.location) } - return str + return str.value } - return try Source.validateUnicodeScalar(str, .hex) + return validateUnicodeScalar(str, .hex) } /// Try to lex a seqence of hex digit unicode scalars. @@ -305,41 +371,40 @@ extension Source { /// mutating func expectUnicodeScalarSequence( eating ending: Character - ) throws -> AST.Atom.Kind { - try recordLoc { src in - var scalars = [AST.Atom.Scalar]() - var trivia = [AST.Trivia]() - - // Eat up any leading whitespace. - if let t = src.lexWhitespace() { trivia.append(t) } - - while true { - let str = src.lexUntil { src in - // Hit the ending, stop lexing. - if src.isEmpty || src.peek() == ending { - return true - } - // Eat up trailing whitespace, and stop lexing to record the scalar. - if let t = src.lexWhitespace() { - trivia.append(t) - return true - } - // Not the ending or trivia, must be a digit of the scalar. - return false + ) -> AST.Atom.Kind { + var scalars = [AST.Atom.Scalar]() + var trivia = [AST.Trivia]() + + // Eat up any leading whitespace. + if let t = lexWhitespace() { trivia.append(t) } + + while true { + let str = lexUntil { p in + // Hit the ending, stop lexing. 
+ if p.src.isEmpty || p.peek() == ending { + return true } - guard !str.value.isEmpty else { break } - scalars.append(try Source.validateUnicodeScalar(str, .hex)) - } - guard !scalars.isEmpty else { - throw ParseError.expectedNumber("", kind: .hex) + // Eat up trailing whitespace, and stop lexing to record the scalar. + if let t = p.lexWhitespace() { + trivia.append(t) + return true + } + // Not the ending or trivia, must be a digit of the scalar. + return false } - try src.expect(ending) + guard !str.value.isEmpty else { break } + scalars.append(validateUnicodeScalar(str, .hex)) + } + expect(ending) - if scalars.count == 1 { - return .scalar(scalars[0]) - } - return .scalarSequence(.init(scalars, trivia: trivia)) - }.value + if scalars.isEmpty { + errorAtCurrentPosition(.expectedNumber("", kind: .hex)) + return .scalar(.init(UnicodeScalar(0), loc(src.currentPosition))) + } + if scalars.count == 1 { + return .scalar(scalars[0]) + } + return .scalarSequence(.init(scalars, trivia: trivia)) } /// Try to eat a scalar off the front, starting from after the backslash and @@ -353,62 +418,59 @@ extension Source { /// | 'o{' OctalDigit{1...} '}' /// | '0' OctalDigit{0...3} /// - mutating func lexUnicodeScalar() throws -> AST.Atom.Kind? { - try recordLoc { src in - try src.tryEating { src in + mutating func lexUnicodeScalar() -> AST.Atom.Kind? { + tryEating { p in - func nullScalar() -> AST.Atom.Kind { - let pos = src.currentPosition - return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos))) - } + func nullScalar() -> AST.Atom.Scalar { + .init(UnicodeScalar(0), p.loc(p.src.currentPosition)) + } - // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set. - switch src.tryEat() { + // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set. + switch p.tryEat() { // Hex numbers. 
- case "u" where src.tryEat("{"): - return try src.expectUnicodeScalarSequence(eating: "}") + case "u" where p.tryEat("{"): + return p.expectUnicodeScalarSequence(eating: "}") - case "x" where src.tryEat("{"): - let str = try src.lexUntil(eating: "}") - return .scalar(try Source.validateUnicodeScalar(str, .hex)) + case "x" where p.tryEat("{"): + let str = p.lexUntil(eating: "}") + return .scalar(p.validateUnicodeScalar(str, .hex)) - case "x": - // \x expects *up to* 2 digits. - guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit) - else { - // In PCRE, \x without any valid hex digits is \u{0}. - // TODO: This doesn't appear to be followed by ICU or Oniguruma, so - // could be changed to throw an error if we had a parsing mode for - // them. - return nullScalar() - } - return .scalar(try Source.validateUnicodeScalar(digits, .hex)) + case "x": + // \x expects *up to* 2 digits. + guard let digits = p.tryEatPrefix(maxLength: 2, \.isHexDigit) + else { + // In PCRE, \x without any valid hex digits is \u{0}. + // TODO: This doesn't appear to be followed by ICU or Oniguruma, so + // could be changed to diagnose an error if we had a parsing mode for + // them. + return .scalar(nullScalar()) + } + return .scalar(p.validateUnicodeScalar(digits, .hex)) - case "u": - return .scalar(try src.expectUnicodeScalar(numDigits: 4)) - case "U": - return .scalar(try src.expectUnicodeScalar(numDigits: 8)) + case "u": + return .scalar(p.expectUnicodeScalar(numDigits: 4)) + case "U": + return .scalar(p.expectUnicodeScalar(numDigits: 8)) // Octal numbers. - case "o" where src.tryEat("{"): - let str = try src.lexUntil(eating: "}") - return .scalar(try Source.validateUnicodeScalar(str, .octal)) - - case "0": - // We can read *up to* 3 more octal digits. - // FIXME: PCRE can only read up to 2 octal digits, if we get a strict - // PCRE mode, we should limit it here. 
- guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit) - else { - return nullScalar() - } - return .scalar(try Source.validateUnicodeScalar(digits, .octal)) - - default: - return nil + case "o" where p.tryEat("{"): + let str = p.lexUntil(eating: "}") + return .scalar(p.validateUnicodeScalar(str, .octal)) + + case "0": + // We can read *up to* 3 more octal digits. + // FIXME: PCRE can only read up to 2 octal digits, if we get a strict + // PCRE mode, we should limit it here. + guard let digits = p.tryEatPrefix(maxLength: 3, \.isOctalDigit) + else { + return .scalar(nullScalar()) } + return .scalar(p.validateUnicodeScalar(digits, .octal)) + + default: + return nil } - }.value + } } /// Try to consume a quantifier @@ -417,21 +479,20 @@ extension Source { /// QuantKind -> '?' | '+' /// mutating func lexQuantifier( - context: ParsingContext - ) throws -> (Located, Located, [AST.Trivia])? { + ) -> (Located, Located, [AST.Trivia])? { var trivia: [AST.Trivia] = [] - if let t = lexNonSemanticWhitespace(context: context) { trivia.append(t) } + if let t = lexNonSemanticWhitespace() { trivia.append(t) } - let amt: Located? = try recordLoc { src in - if src.tryEat("*") { return .zeroOrMore } - if src.tryEat("+") { return .oneOrMore } - if src.tryEat("?") { return .zeroOrOne } + let amt: Located? = recordLoc { p in + if p.tryEat("*") { return .zeroOrMore } + if p.tryEat("+") { return .oneOrMore } + if p.tryEat("?") { return .zeroOrOne } - return try src.tryEating { src in - guard src.tryEat("{"), - let range = try src.lexRange(context: context, trivia: &trivia), - src.tryEat("}") + return p.tryEating { p in + guard p.tryEat("{"), + let range = p.lexRange(trivia: &trivia), + p.tryEat("}") else { return nil } return range.value } @@ -439,11 +500,11 @@ extension Source { guard let amt = amt else { return nil } // PCRE allows non-semantic whitespace here in extended syntax mode. 
- if let t = lexNonSemanticWhitespace(context: context) { trivia.append(t) } + if let t = lexNonSemanticWhitespace() { trivia.append(t) } - let kind: Located = recordLoc { src in - if src.tryEat("?") { return .reluctant } - if src.tryEat("+") { return .possessive } + let kind: Located = recordLoc { p in + if p.tryEat("?") { return .reluctant } + if p.tryEat("+") { return .possessive } return .eager } @@ -456,44 +517,40 @@ extension Source { /// | ExpRange /// ExpRange -> '..<' | '...' /// | '..<' | '...' ? - mutating func lexRange( - context: ParsingContext, trivia: inout [AST.Trivia] - ) throws -> Located? { - try recordLoc { src in - try src.tryEating { src in - if let t = src.lexWhitespace() { trivia.append(t) } + mutating func lexRange(trivia: inout [AST.Trivia]) -> Located? { + recordLoc { p in + p.tryEating { p in + if let t = p.lexWhitespace() { trivia.append(t) } - let lowerOpt = try src.lexNumber() + let lowerOpt = p.lexNumber() - if let t = src.lexWhitespace() { trivia.append(t) } + if let t = p.lexWhitespace() { trivia.append(t) } // ',' or '...' or '..<' or nothing - // TODO: We ought to try and consume whitespace here and emit a - // diagnostic for the user warning them that it would cause the range to - // be treated as literal. let closedRange: Bool? - if src.tryEat(",") { + if p.tryEat(",") { closedRange = true - } else if context.experimentalRanges && src.tryEat(".") { - try src.expect(".") - if src.tryEat(".") { + } else if p.context.experimentalRanges && p.tryEat(".") { + p.expect(".") + if p.tryEat(".") { closedRange = true } else { - try src.expect("<") + p.expect("<") closedRange = false } } else { closedRange = nil } - if let t = src.lexWhitespace() { trivia.append(t) } + if let t = p.lexWhitespace() { trivia.append(t) } - let upperOpt = try src.lexNumber()?.map { upper in + var upperOpt = p.lexNumber() + if closedRange == false { // If we have an open range, the upper bound should be adjusted down. - closedRange == true ? 
upper : upper - 1 + upperOpt?.value? -= 1 } - if let t = src.lexWhitespace() { trivia.append(t) } + if let t = p.lexWhitespace() { trivia.append(t) } switch (lowerOpt, closedRange, upperOpt) { case let (l?, nil, nil): @@ -506,7 +563,8 @@ extension Source { return .range(l, u) case (nil, nil, _?): - fatalError("Didn't lex lower bound, but lexed upper bound?") + p.unreachable("Didn't lex lower bound, but lexed upper bound?") + return nil default: return nil } @@ -515,34 +573,31 @@ extension Source { } private mutating func lexUntil( - _ predicate: (inout Source) throws -> Bool - ) rethrows -> Located { - // We track locations outside of recordLoc, as the predicate may advance the - // input when we hit the end, and we don't want that to affect the location - // of what was lexed in the `result`. We still want the recordLoc call to - // attach locations to any thrown errors though. + _ predicate: (inout Self) -> Bool + ) -> Located { + // We track locations without using recordLoc, as the predicate may advance + // the input when we hit the end, and we don't want that to affect the + // location of what was lexed in the `result`. // TODO: We should find a better way of doing this, `lexUntil` seems full // of footguns. 
- let start = currentPosition - var end = currentPosition + let start = src.currentPosition + var end = src.currentPosition var result = "" - try recordLoc { src in - while try !predicate(&src) { - result.append(src.eat()) - end = src.currentPosition - } + while !predicate(&self), let c = tryEat() { + result.append(c) + end = src.currentPosition } return .init(result, start ..< end) } - private mutating func lexUntil(eating end: String) throws -> Located { - try lexUntil { try $0.tryEatNonEmpty(sequence: end) } + private mutating func lexUntil(eating end: String) -> Located { + lexUntil { $0.tryEatNonEmpty(sequence: end) } } private mutating func lexUntil( eating end: Character - ) throws -> Located { - try lexUntil(eating: String(end)) + ) -> Located { + lexUntil(eating: String(end)) } /// Expect a linear run of non-nested non-empty content ending with a given @@ -551,28 +606,28 @@ extension Source { private mutating func expectQuoted( endingWith endSingle: String, count: Int = 1, ignoreEscaped: Bool = false, eatEnding: Bool = true - ) throws -> Located { + ) -> Located { let end = String(repeating: endSingle, count: count) - let result = try recordLoc { src -> String in - try src.lexUntil { src in - if src.starts(with: end) { + let result = recordLoc { p -> String in + p.lexUntil { p in + if p.src.starts(with: end) { return true } - try src.expectNonEmpty(.expected(endSingle)) + guard p.expectNonEmpty(.expected(endSingle)) else { return true } // Ignore escapes if we're allowed to. lexUntil will consume the next // character. 
- if ignoreEscaped, src.tryEat("\\") { - try src.expectNonEmpty(.expectedEscape) + if ignoreEscaped, p.tryEat("\\") { + guard p.expectNonEmpty(.expectedEscape) else { return true } } return false }.value } - guard !result.value.isEmpty else { - throw ParseError.expectedNonEmptyContents + if result.value.isEmpty { + error(.expectedNonEmptyContents, at: result.location) } if eatEnding { - try expect(sequence: end) + expect(sequence: end) } return result } @@ -589,28 +644,28 @@ extension Source { /// /// TODO: Need to support some escapes /// - mutating func lexQuote(context: ParsingContext) throws -> AST.Quote? { - let str = try recordLoc { src -> String? in - if src.tryEat(sequence: #"\Q"#) { - let contents = src.lexUntil { src in - src.isEmpty || src.tryEat(sequence: #"\E"#) - }.value + mutating func lexQuote() -> AST.Quote? { + let str = recordLoc { p -> String? in + if p.tryEat(sequence: #"\Q"#) { + let contents = p.lexUntil { p in + p.src.isEmpty || p.tryEat(sequence: #"\E"#) + } // In multi-line literals, the quote may not span multiple lines. - if context.syntax.contains(.multilineCompilerLiteral), - contents.spansMultipleLinesInRegexLiteral { - throw ParseError.quoteMayNotSpanMultipleLines + if p.context.syntax.contains(.multilineCompilerLiteral), + contents.value.spansMultipleLinesInRegexLiteral { + p.error(.quoteMayNotSpanMultipleLines, at: contents.location) } // The sequence must not be empty in a custom character class. - if context.isInCustomCharacterClass && contents.isEmpty { - throw ParseError.expectedNonEmptyContents + if p.context.isInCustomCharacterClass && contents.value.isEmpty { + p.error(.expectedNonEmptyContents, at: contents.location) } - return contents + return contents.value } - if context.experimentalQuotes, src.tryEat("\"") { + if p.context.experimentalQuotes, p.tryEat("\"") { // TODO: Can experimental quotes be empty? 
- return try src.expectQuoted(endingWith: "\"", ignoreEscaped: true).value + return p.expectQuoted(endingWith: "\"", ignoreEscaped: true).value } return nil } @@ -622,16 +677,13 @@ extension Source { /// /// Interpolation -> '<{' String '}>' /// - mutating func lexInterpolation() throws -> AST.Interpolation? { - let contents = try recordLoc { src -> String? in - try src.tryEating { src in - guard src.tryEat(sequence: "<{") else { return nil } - _ = src.lexUntil { $0.isEmpty || $0.starts(with: "}>") } - guard src.tryEat(sequence: "}>") else { return nil } - - // Not currently supported. We error here instead of during Sema to - // get a better error for something like `(<{)}>`. - throw ParseError.unsupported("interpolation") + mutating func lexInterpolation() -> AST.Interpolation? { + let contents = recordLoc { p -> String? in + p.tryEating { p in + guard p.tryEat(sequence: "<{") else { return nil } + let contents = p.lexUntil { $0.src.isEmpty || $0.src.starts(with: "}>") } + guard p.tryEat(sequence: "}>") else { return nil } + return contents.value } } guard let contents = contents else { return nil } @@ -652,34 +704,34 @@ extension Source { /// /// TODO: Swift-style nested comments, line-ending comments, etc /// - mutating func lexComment(context: ParsingContext) throws -> AST.Trivia? { - let trivia: Located? = try recordLoc { src in - if !context.isInCustomCharacterClass && src.tryEat(sequence: "(?#") { - return try src.lexUntil(eating: ")").value + mutating func lexComment() -> AST.Trivia? { + let trivia: Located? 
= recordLoc { p in + if !p.context.isInCustomCharacterClass && p.tryEat(sequence: "(?#") { + return p.lexUntil(eating: ")").value } - if context.experimentalComments, src.tryEat(sequence: "/*") { - return try src.lexUntil(eating: "*/").value + if p.context.experimentalComments, p.tryEat(sequence: "/*") { + return p.lexUntil(eating: "*/").value } - if context.endOfLineComments, src.tryEat("#") { + if p.context.endOfLineComments, p.tryEat("#") { // Try eat until we either exhaust the input, or hit a newline. Note // that the definition of newline can be altered depending on the global // matching options. By default we consider a newline to be `\n` or // `\r`. - return src.lexUntil { src in - if src.isEmpty { return true } - switch context.newlineMode { + return p.lexUntil { p in + if p.src.isEmpty { return true } + switch p.context.newlineMode { case .carriageReturnOnly: - return src.tryEat("\r") + return p.tryEat("\r") case .linefeedOnly: - return src.tryEat("\n") + return p.tryEat("\n") case .carriageAndLinefeedOnly: - return src.tryEat("\r\n") + return p.tryEat("\r\n") case .anyCarriageReturnOrLinefeed: - return src.tryEat(anyOf: "\r", "\n", "\r\n") != nil + return p.tryEat(anyOf: "\r", "\n", "\r\n") != nil case .anyUnicode: - return src.tryEat(where: \.isNewline) + return p.tryEat(where: \.isNewline) case .nulCharacter: - return src.tryEat("\0") + return p.tryEat("\0") } }.value } @@ -694,9 +746,7 @@ extension Source { /// Whitespace -> WhitespaceChar+ /// /// Does nothing unless `SyntaxOptions.nonSemanticWhitespace` is set - mutating func lexNonSemanticWhitespace( - context: ParsingContext - ) -> AST.Trivia? { + mutating func lexNonSemanticWhitespace() -> AST.Trivia? { guard context.ignoreWhitespace else { return nil } // FIXME: PCRE only treats space and tab characters as whitespace when @@ -713,10 +763,7 @@ extension Source { /// Unlike `lexNonSemanticWhitespace`, this will always attempt to lex /// whitespace. mutating func lexWhitespace() -> AST.Trivia? 
{ - let trivia: Located? = recordLoc { src in - src.tryEatPrefix(\.isPatternWhitespace)?.string - } - guard let trivia = trivia else { return nil } + guard let trivia = tryEatPrefix(\.isPatternWhitespace) else { return nil } return AST.Trivia(trivia) } @@ -724,11 +771,11 @@ extension Source { /// /// Trivia -> Comment | Whitespace /// - mutating func lexTrivia(context: ParsingContext) throws -> AST.Trivia? { - if let comment = try lexComment(context: context) { + mutating func lexTrivia() -> AST.Trivia? { + if let comment = lexComment() { return comment } - if let whitespace = lexNonSemanticWhitespace(context: context) { + if let whitespace = lexNonSemanticWhitespace() { return whitespace } return nil @@ -739,55 +786,61 @@ extension Source { /// MatchingOption -> 'i' | 'J' | 'm' | 'n' | 's' | 'U' | 'x' | 'xx' | 'w' /// | 'D' | 'P' | 'S' | 'W' | 'y{' ('g' | 'w') '}' /// - mutating func lexMatchingOption() throws -> AST.MatchingOption? { + mutating func lexMatchingOption() -> AST.MatchingOption? { typealias OptKind = AST.MatchingOption.Kind - let locOpt = try recordLoc { src -> OptKind? in - func advanceAndReturn(_ o: OptKind) -> OptKind { - src.advance() - return o - } - guard let c = src.peek() else { return nil } - switch c { - // PCRE options. - case "i": return advanceAndReturn(.caseInsensitive) - case "J": return advanceAndReturn(.allowDuplicateGroupNames) - case "m": return advanceAndReturn(.multiline) - case "n": return advanceAndReturn(.namedCapturesOnly) - case "s": return advanceAndReturn(.singleLine) - case "U": return advanceAndReturn(.reluctantByDefault) - case "x": - src.advance() - return src.tryEat("x") ? .extraExtended : .extended - - // ICU options. - case "w": return advanceAndReturn(.unicodeWordBoundaries) - - // Oniguruma options. 
- case "D": return advanceAndReturn(.asciiOnlyDigit) - case "P": return advanceAndReturn(.asciiOnlyPOSIXProps) - case "S": return advanceAndReturn(.asciiOnlySpace) - case "W": return advanceAndReturn(.asciiOnlyWord) - case "y": - src.advance() - try src.expect("{") - let opt: OptKind - if src.tryEat("w") { - opt = .textSegmentWordMode - } else { - try src.expect("g") - opt = .textSegmentGraphemeMode + let locOpt = recordLoc { p -> OptKind? in + p.tryEating { p in + guard let c = p.tryEat() else { return nil } + switch c { + // PCRE options. + case "i": return .caseInsensitive + case "J": return .allowDuplicateGroupNames + case "m": return .multiline + case "n": return .namedCapturesOnly + case "s": return .singleLine + case "U": return .reluctantByDefault + case "x": + return p.tryEat("x") ? .extraExtended : .extended + + // ICU options. + case "w": return .unicodeWordBoundaries + + // Oniguruma options. + case "D": return .asciiOnlyDigit + case "P": return .asciiOnlyPOSIXProps + case "S": return .asciiOnlySpace + case "W": return .asciiOnlyWord + case "y": + // Default to grapheme cluster if unknown. 
+ let recoveryMode = OptKind.textSegmentGraphemeMode + guard p.expect("{") else { return recoveryMode } + + guard let optChar = p.tryEatWithLoc(), optChar.value != "}" else { + p.errorAtCurrentPosition(.expected("text segment mode")) + return recoveryMode + } + let opt: OptKind + switch optChar.value { + case "w": + opt = .textSegmentWordMode + case "g": + opt = .textSegmentGraphemeMode + case let x: + p.error(.unknownTextSegmentMatchingOption(x), at: optChar.location) + opt = recoveryMode + } + p.expect("}") + return opt + + // Swift semantic level options + case "X": return .graphemeClusterSemantics + case "u": return .unicodeScalarSemantics + case "b": return .byteSemantics + + default: + return nil } - try src.expect("}") - return opt - - // Swift semantic level options - case "X": return advanceAndReturn(.graphemeClusterSemantics) - case "u": return advanceAndReturn(.unicodeScalarSemantics) - case "b": return advanceAndReturn(.byteSemantics) - - default: - return nil } } guard let locOpt = locOpt else { return nil } @@ -799,109 +852,98 @@ extension Source { /// MatchingOptionSeq -> '^' MatchingOption* | MatchingOption+ /// | MatchingOption* '-' MatchingOption* /// - mutating func lexMatchingOptionSequence( - context: ParsingContext - ) throws -> AST.MatchingOptionSequence? { + mutating func lexMatchingOptionSequence() -> AST.MatchingOptionSequence? { // PCRE accepts '(?)' // TODO: This is a no-op, should we warn? if peek() == ")" { return .init(caretLoc: nil, adding: [], minusLoc: nil, removing: []) } - let ateCaret = recordLoc { $0.tryEat("^") } + let caret = tryEatWithLoc("^") // TODO: Warn on duplicate options, and options appearing in both adding // and removing lists? var adding: [AST.MatchingOption] = [] - while let opt = try lexMatchingOption() { + while let opt = lexMatchingOption() { adding.append(opt) } - // If the sequence begun with a caret '^', options can only be added, so - // we're done. 
- if ateCaret.value { - if peek() == "-" { - throw ParseError.cannotRemoveMatchingOptionsAfterCaret - } - return .init(caretLoc: ateCaret.location, adding: adding, minusLoc: nil, - removing: []) - } - // Try to lex options to remove. - let ateMinus = recordLoc { $0.tryEat("-") } - if ateMinus.value { - var removing: [AST.MatchingOption] = [] - while let opt = try lexMatchingOption() { + var removing: [AST.MatchingOption] = [] + let minus = tryEatWithLoc("-") + if minus != nil { + if let caret = caret { + // Options cannot be removed if '^' is used. + error(.cannotRemoveMatchingOptionsAfterCaret, at: caret) + } + while let opt = lexMatchingOption() { // Text segment options can only be added, they cannot be removed // with (?-), they should instead be set to a different mode. if opt.isTextSegmentMode { - throw ParseError.cannotRemoveTextSegmentOptions + error(.cannotRemoveTextSegmentOptions, at: opt.location) } // Matching semantics options can only be added, not removed. if opt.isSemanticMatchingLevel { - throw ParseError.cannotRemoveSemanticsOptions + error(.cannotRemoveSemanticsOptions, at: opt.location) } removing.append(opt) } - return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location, - removing: removing) } - guard !adding.isEmpty else { return nil } - return .init(caretLoc: nil, adding: adding, minusLoc: nil, removing: []) + // We must have lexed at least something to proceed. + guard caret != nil || minus != nil || !adding.isEmpty else { return nil } + return .init( + caretLoc: caret, adding: adding, minusLoc: minus, removing: removing) } /// A matching option changing atom. /// /// '(?' MatchingOptionSeq ')' /// - mutating func lexChangeMatchingOptionAtom( - context: ParsingContext - ) throws -> AST.MatchingOptionSequence? { - try tryEating { src in - guard src.tryEat(sequence: "(?"), - let seq = try src.lexMatchingOptionSequence(context: context) + mutating func lexChangeMatchingOptionAtom() -> AST.MatchingOptionSequence? 
{ + tryEating { p in + guard p.tryEat(sequence: "(?"), let seq = p.lexMatchingOptionSequence() else { return nil } - try src.expect(")") + p.expect(")") return seq } } /// Try to consume explicitly spelled-out PCRE2 group syntax. mutating func lexExplicitPCRE2GroupStart() -> AST.Group.Kind? { - tryEating { src in - guard src.tryEat(sequence: "(*") else { return nil } + tryEating { p in + guard p.tryEat(sequence: "(*") else { return nil } - if src.tryEat(sequence: "atomic:") { + if p.tryEat(sequence: "atomic:") { return .atomicNonCapturing } - if src.tryEat(sequence: "pla:") || - src.tryEat(sequence: "positive_lookahead:") { + if p.tryEat(sequence: "pla:") || + p.tryEat(sequence: "positive_lookahead:") { return .lookahead } - if src.tryEat(sequence: "nla:") || - src.tryEat(sequence: "negative_lookahead:") { + if p.tryEat(sequence: "nla:") || + p.tryEat(sequence: "negative_lookahead:") { return .negativeLookahead } - if src.tryEat(sequence: "plb:") || - src.tryEat(sequence: "positive_lookbehind:") { + if p.tryEat(sequence: "plb:") || + p.tryEat(sequence: "positive_lookbehind:") { return .lookbehind } - if src.tryEat(sequence: "nlb:") || - src.tryEat(sequence: "negative_lookbehind:") { + if p.tryEat(sequence: "nlb:") || + p.tryEat(sequence: "negative_lookbehind:") { return .negativeLookbehind } - if src.tryEat(sequence: "napla:") || - src.tryEat(sequence: "non_atomic_positive_lookahead:") { + if p.tryEat(sequence: "napla:") || + p.tryEat(sequence: "non_atomic_positive_lookahead:") { return .nonAtomicLookahead } - if src.tryEat(sequence: "naplb:") || - src.tryEat(sequence: "non_atomic_positive_lookbehind:") { + if p.tryEat(sequence: "naplb:") || + p.tryEat(sequence: "non_atomic_positive_lookbehind:") { return .nonAtomicLookbehind } - if src.tryEat(sequence: "sr:") || src.tryEat(sequence: "script_run:") { + if p.tryEat(sequence: "sr:") || p.tryEat(sequence: "script_run:") { return .scriptRun } - if src.tryEat(sequence: "asr:") || - src.tryEat(sequence: 
"atomic_script_run:") { + if p.tryEat(sequence: "asr:") || + p.tryEat(sequence: "atomic_script_run:") { return .atomicScriptRun } return nil @@ -914,34 +956,30 @@ extension Source { /// private mutating func expectIdentifier( _ kind: IdentifierKind, endingWith ending: String, eatEnding: Bool = true - ) throws -> Located { - let str = try recordLoc { src -> String in - if src.isEmpty || src.tryEat(sequence: ending) { - throw ParseError.expectedIdentifier(kind) + ) -> Located { + let str = recordLoc { p -> String in + guard !p.src.isEmpty && !p.src.starts(with: ending) else { + p.errorAtCurrentPosition(.expectedIdentifier(kind)) + return "" } - if src.peek()!.isNumber { - throw ParseError.identifierCannotStartWithNumber(kind) + let firstChar = p.peekWithLoc()! + if firstChar.value.isNumber { + p.error(.identifierCannotStartWithNumber(kind), at: firstChar.location) } - guard let str = src.tryEatPrefix(\.isWordCharacter)?.string else { - throw ParseError.identifierMustBeAlphaNumeric(kind) + guard let str = p.tryEatPrefix(\.isWordCharacter) else { + p.error(.identifierMustBeAlphaNumeric(kind), at: firstChar.location) + // Try skip ahead to the closing delimiter for better recovery. + _ = p.lexUntil { $0.src.isEmpty || $0.src.starts(with: ending) } + return "" } - return str + return str.value } if eatEnding { - try expect(sequence: ending) + expect(sequence: ending) } return str } - /// Try to consume an identifier, returning `nil` if unsuccessful. - private mutating func lexIdentifier( - _ kind: IdentifierKind, endingWith end: String, eatEnding: Bool = true - ) -> Located? { - tryEating { src in - try? src.expectIdentifier(kind, endingWith: end, eatEnding: eatEnding) - } - } - /// Consume a named group field, producing either a named capture or balanced /// capture. /// @@ -952,23 +990,23 @@ extension Source { /// private mutating func expectNamedGroup( endingWith ending: String - ) throws -> AST.Group.Kind { - func lexBalanced(_ lhs: Located? 
= nil) throws -> AST.Group.Kind? { + ) -> AST.Group.Kind { + func lexBalanced(_ lhs: Located? = nil) -> AST.Group.Kind? { // If we have a '-', this is a .NET-style 'balanced group'. guard let dash = tryEatWithLoc("-") else { return nil } - let rhs = try expectIdentifier(.groupName, endingWith: ending) + let rhs = expectIdentifier(.groupName, endingWith: ending) return .balancedCapture(.init(name: lhs, dash: dash, priorName: rhs)) } // Lex a group name, trying to lex a '-rhs' for a balanced capture group // both before and after. - if let b = try lexBalanced() { return b } - let name = try expectIdentifier( + if let b = lexBalanced() { return b } + let name = expectIdentifier( .groupName, endingWith: ending, eatEnding: false ) - if let b = try lexBalanced(name) { return b } + if let b = lexBalanced(name) { return b } - try expect(sequence: ending) + expect(sequence: ending) return .namedCapture(name) } @@ -989,15 +1027,13 @@ extension Source { /// need to be parsed earlier than the group check, as /// comments, like quotes, cannot be quantified. /// - mutating func lexGroupStart( - context: ParsingContext - ) throws -> Located? { - try recordLoc { src in - try src.tryEating { src in + mutating func lexGroupStart() -> Located? { + recordLoc { p in + p.tryEating { p in // Explicitly spelled out PRCE2 syntax for some groups. This needs to be // done before group-like atoms, as it uses the '(*' syntax, which is // otherwise a group-like atom. - if let g = src.lexExplicitPCRE2GroupStart() { return g } + if let g = p.lexExplicitPCRE2GroupStart() { return g } // There are some atoms that syntactically look like groups, bail here // if we see any. Care needs to be taken here as e.g a group starting @@ -1005,54 +1041,57 @@ extension Source { // otherwise a matching option specifier. Conversely, '(?P' can be the // start of a matching option sequence, or a reference if it is followed // by '=' or '<'. 
- guard !src.shouldLexGroupLikeAtom(context: context) else { return nil } + guard !p.shouldLexGroupLikeAtom() else { return nil } - guard src.tryEat("(") else { return nil } - if src.tryEat("?") { - if src.tryEat(":") { return .nonCapture } - if src.tryEat("|") { return .nonCaptureReset } - if src.tryEat(">") { return .atomicNonCapturing } - if src.tryEat("=") { return .lookahead } - if src.tryEat("!") { return .negativeLookahead } - if src.tryEat("*") { return .nonAtomicLookahead } + guard p.tryEat("(") else { return nil } + if p.tryEat("?") { + if p.tryEat(":") { return .nonCapture } + if p.tryEat("|") { return .nonCaptureReset } + if p.tryEat(">") { return .atomicNonCapturing } + if p.tryEat("=") { return .lookahead } + if p.tryEat("!") { return .negativeLookahead } + if p.tryEat("*") { return .nonAtomicLookahead } - if src.tryEat(sequence: "<=") { return .lookbehind } - if src.tryEat(sequence: "") + if p.tryEat("<") || p.tryEat(sequence: "P<") { + return p.expectNamedGroup(endingWith: ">") } - if src.tryEat("'") { - return try src.expectNamedGroup(endingWith: "'") + if p.tryEat("'") { + return p.expectNamedGroup(endingWith: "'") } // Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:). 
- if let seq = try src.lexMatchingOptionSequence(context: context) { - guard src.tryEat(":") else { - if let next = src.peek() { - throw ParseError.invalidMatchingOption(next) + if let seq = p.lexMatchingOptionSequence() { + if !p.tryEat(":") { + if let next = p.peekWithLoc() { + p.error(.invalidMatchingOption(next.value), at: next.location) + } else { + p.errorAtCurrentPosition(.expected(")")) } - throw ParseError.expected(")") } return .changeMatchingOptions(seq) } - guard let next = src.peek() else { - throw ParseError.expectedGroupSpecifier + if let next = p.peekWithLoc() { + p.error(.unknownGroupKind("?\(next.value)"), at: next.location) + } else { + p.errorAtCurrentPosition(.expectedGroupSpecifier) } - throw ParseError.unknownGroupKind("?\(next)") + return .nonCapture } // (_:) - if context.experimentalCaptures && src.tryEat(sequence: "_:") { + if p.context.experimentalCaptures && p.tryEat(sequence: "_:") { return .nonCapture } // TODO: (name:) // If (?n) is set, a bare (...) group is non-capturing. - if context.syntax.contains(.namedCapturesOnly) { + if p.context.syntax.contains(.namedCapturesOnly) { return .nonCapture } return .capture @@ -1065,11 +1104,12 @@ extension Source { /// PCREVersionNumber -> . /// private mutating func expectPCREVersionNumber( - ) throws -> AST.Conditional.Condition.PCREVersionNumber { - let nums = try recordLoc { src -> (major: Int, minor: Int) in - let major = try src.expectNumber().value - try src.expect(".") - let minor = try src.expectNumber().value + ) -> AST.Conditional.Condition.PCREVersionNumber { + let nums = recordLoc { p -> (major: AST.Atom.Number, + minor: AST.Atom.Number) in + let major = p.expectNumber() + p.expect(".") + let minor = p.expectNumber() return (major, minor) } return .init(major: nums.value.major, minor: nums.value.minor, @@ -1081,14 +1121,14 @@ extension Source { /// PCREVersionCheck -> '>'? 
'=' PCREVersionNumber /// private mutating func expectPCREVersionCheck( - ) throws -> AST.Conditional.Condition.Kind { + ) -> AST.Conditional.Condition.Kind { typealias Kind = AST.Conditional.Condition.PCREVersionCheck.Kind - let kind = try recordLoc { src -> Kind in - let greaterThan = src.tryEat(">") - try src.expect("=") + let kind = recordLoc { p -> Kind in + let greaterThan = p.tryEat(">") + p.expect("=") return greaterThan ? .greaterThanOrEqual : .equal } - return .pcreVersionCheck(.init(kind, try expectPCREVersionNumber())) + return .pcreVersionCheck(.init(kind, expectPCREVersionNumber())) } /// Try to lex a known condition (excluding group conditions). @@ -1103,46 +1143,44 @@ extension Source { /// | NumberRef /// | NameRef /// - private mutating func lexKnownCondition( - context: ParsingContext - ) throws -> AST.Conditional.Condition? { + private mutating func lexKnownCondition() -> AST.Conditional.Condition? { typealias ConditionKind = AST.Conditional.Condition.Kind - let kind = try recordLoc { src -> ConditionKind? in - try src.tryEating { src in + let kind = recordLoc { p -> ConditionKind? in + p.tryEating { p in // PCRE recursion check. - if src.tryEat("R") { - if src.tryEat("&") { + if p.tryEat("R") { + if p.tryEat("&") { return .groupRecursionCheck( - try src.expectNamedReference(endingWith: ")", eatEnding: false)) + p.expectNamedReference(endingWith: ")", eatEnding: false)) } - if let num = try src.lexNumber() { + if let num = p.lexNumber() { return .groupRecursionCheck( - .init(.absolute(num.value), innerLoc: num.location)) + .init(.absolute(num), innerLoc: num.location)) } return .recursionCheck } - if let open = src.tryEat(anyOf: "<", "'") { + if let open = p.tryEat(anyOf: "<", "'") { // In PCRE, this can only be a named reference. In Oniguruma, it can // also be a numbered reference. 
- let closing = String(Source.getClosingDelimiter(for: open)) + let closing = String(p.getClosingDelimiter(for: open)) return .groupMatched( - try src.expectNamedOrNumberedReference(endingWith: closing)) + p.expectNamedOrNumberedReference(endingWith: closing)) } // PCRE group definition and version check. - if src.tryEat(sequence: "DEFINE") { + if p.tryEat(sequence: "DEFINE") { return .defineGroup } - if src.tryEat(sequence: "VERSION") { - return try src.expectPCREVersionCheck() + if p.tryEat(sequence: "VERSION") { + return p.expectPCREVersionCheck() } // If we have a numbered reference, this is a check to see if a group // matched. Oniguruma also permits a recursion level here. - if let num = try src.lexNumberedReference(allowRecursionLevel: true) { + if let num = p.lexNumberedReference(allowRecursionLevel: true) { return .groupMatched(num) } @@ -1153,9 +1191,9 @@ extension Source { // FIXME: This should apply to future groups too. // TODO: We should probably advise users to use the more explicit // syntax. - let nameRef = src.lexNamedReference( + let nameRef = p.lexNamedReference( endingWith: ")", eatEnding: false, allowRecursionLevel: true) - if let nameRef = nameRef, context.isPriorGroupRef(nameRef.kind) { + if let nameRef = nameRef, p.context.isPriorGroupRef(nameRef.kind) { return .groupMatched(nameRef) } return nil @@ -1169,14 +1207,11 @@ extension Source { /// /// KnownConditionalStart -> '(?(' KnownCondition ')' /// - mutating func lexKnownConditionalStart( - context: ParsingContext - ) throws -> AST.Conditional.Condition? { - try tryEating { src in - guard src.tryEat(sequence: "(?("), - let cond = try src.lexKnownCondition(context: context) + mutating func lexKnownConditionalStart() -> AST.Conditional.Condition? { + tryEating { p in + guard p.tryEat(sequence: "(?("), let cond = p.lexKnownCondition() else { return nil } - try src.expect(")") + p.expect(")") return cond } } @@ -1185,12 +1220,10 @@ extension Source { /// /// GroupCondStart -> '(?' 
GroupStart /// - mutating func lexGroupConditionalStart( - context: ParsingContext - ) throws -> Located? { - try tryEating { src in - guard src.tryEat(sequence: "(?") else { return nil } - return try src.lexGroupStart(context: context) + mutating func lexGroupConditionalStart() -> Located? { + tryEating { p in + guard p.tryEat(sequence: "(?") else { return nil } + return p.lexGroupStart() } } @@ -1200,24 +1233,24 @@ extension Source { /// mutating func lexAbsentFunctionStart( ) -> Located? { - recordLoc { src in - if src.tryEat(sequence: "(?~|") { return .withPipe } - if src.tryEat(sequence: "(?~") { return .withoutPipe } + recordLoc { p in + if p.tryEat(sequence: "(?~|") { return .withPipe } + if p.tryEat(sequence: "(?~") { return .withoutPipe } return nil } } mutating func lexCustomCCStart() -> Located? { - recordLoc { src in + recordLoc { p in // Make sure we don't have a POSIX character property. This may require // walking to its ending to make sure we have a closing ':]', as otherwise // we have a custom character class. // TODO: This behavior seems subtle, could we warn? - guard !src.canLexPOSIXCharacterProperty() else { + guard !p.canLexPOSIXCharacterProperty() else { return nil } - if src.tryEat("[") { - return src.tryEat("^") ? .inverted : .normal + if p.tryEat("[") { + return p.tryEat("^") ? .inverted : .normal } return nil } @@ -1227,21 +1260,21 @@ extension Source { /// /// CustomCCBinOp -> '--' | '~~' | '&&' /// - mutating func lexCustomCCBinOp() throws -> Located? { - recordLoc { src in + mutating func lexCustomCCBinOp() -> Located? { + recordLoc { p in // TODO: Perhaps a syntax options check (!PCRE) // TODO: Better AST types here - guard let binOp = src.peekCCBinOp() else { return nil } - try! src.expect(sequence: binOp.rawValue) + guard let binOp = p.peekCCBinOp() else { return nil } + p.expect(sequence: binOp.rawValue) return binOp } } // Check to see if we can lex a binary operator. func peekCCBinOp() -> CustomCC.SetOp? 
{ - if starts(with: "--") { return .subtraction } - if starts(with: "~~") { return .symmetricDifference } - if starts(with: "&&") { return .intersection } + if src.starts(with: "--") { return .subtraction } + if src.starts(with: "~~") { return .symmetricDifference } + if src.starts(with: "&&") { return .intersection } return nil } @@ -1250,52 +1283,40 @@ extension Source { /// /// DotNetSubtraction -> Trivia* '-' Trivia* CustomCharClass /// - func canLexDotNetCharClassSubtraction( - context: ParsingContext - ) -> SourceLocation? { - lookahead { src in + mutating func canLexDotNetCharClassSubtraction() -> SourceLocation? { + lookahead { p in // We can lex '-' as a .NET subtraction if it precedes a custom character // class. - while (try? src.lexTrivia(context: context)) != nil {} - guard let dashLoc = src.tryEatWithLoc("-") else { return nil } - while (try? src.lexTrivia(context: context)) != nil {} - guard src.lexCustomCCStart() != nil else { return nil } + while p.lexTrivia() != nil {} + guard let dashLoc = p.tryEatWithLoc("-") else { return nil } + while p.lexTrivia() != nil {} + guard p.lexCustomCCStart() != nil else { return nil } return dashLoc } } private mutating func lexPOSIXCharacterProperty( - ) throws -> Located? { - try recordLoc { src in - try src.tryEating { src in - guard src.tryEat(sequence: "[:") else { return nil } - let inverted = src.tryEat("^") + ) -> Located? { + recordLoc { p in + p.tryEating { p in + guard p.tryEat(sequence: "[:") else { return nil } + let inverted = p.tryEat("^") // Note we lex the contents and ending *before* classifying, because we // want to bail with nil if we don't have the right ending. This allows // the lexing of a custom character class if we don't have a ':]' // ending. 
- let (key, value) = src.lexCharacterPropertyKeyValue() - guard src.tryEat(sequence: ":]") else { return nil } + let (key, value) = p.lexCharacterPropertyKeyValue() + guard p.tryEat(sequence: ":]") else { return nil } - let prop = try Source.classifyCharacterPropertyContents(key: key, - value: value) + let prop = p.classifyCharacterPropertyContents(key: key, value: value) return .init(prop, isInverted: inverted, isPOSIX: true) } } } - private func canLexPOSIXCharacterProperty() -> Bool { - do { - return try lookahead { src in - try src.lexPOSIXCharacterProperty() != nil - } - } catch { - // We want to tend on the side of lexing a POSIX character property, so - // even if it is invalid in some way (e.g invalid property names), still - // try and lex it. - return true - } + private mutating func canLexPOSIXCharacterProperty() -> Bool { + lookahead { $0.lexPOSIXCharacterProperty() != nil } } /// Try to consume a named character. @@ -1303,26 +1324,26 @@ extension Source { /// NamedCharacter -> '\N{' CharName '}' /// CharName -> 'U+' HexDigit{1...8} | [\s\w-]+ /// - private mutating func lexNamedCharacter() throws -> Located? { - try recordLoc { src in - guard src.tryEat(sequence: "N{") else { return nil } + private mutating func lexNamedCharacter() -> Located? { + recordLoc { p in + guard p.tryEat(sequence: "N{") else { return nil } // We should either have a unicode scalar. - if src.tryEat(sequence: "U+") { - let str = try src.lexUntil(eating: "}") - return .scalar(try Source.validateUnicodeScalar(str, .hex)) + if p.tryEat(sequence: "U+") { + let str = p.lexUntil(eating: "}") + return .scalar(p.validateUnicodeScalar(str, .hex)) } // Or we should have a character name. // TODO: Validate the types of characters that can appear in the name? 
- return .namedCharacter(try src.lexUntil(eating: "}").value) + return .namedCharacter(p.lexUntil(eating: "}").value) } } private mutating func lexCharacterPropertyKeyValue( - ) -> (key: String?, value: String) { - func atPossibleEnding(_ src: inout Source) -> Bool { - guard let next = src.peek() else { return true } + ) -> (key: Located?, value: Located) { + func atPossibleEnding(_ p: inout Self) -> Bool { + guard let next = p.peek() else { return true } switch next { case "=": // End of a key. @@ -1358,21 +1379,21 @@ extension Source { // - 'x=y' where 'x' is a property key, and 'y' is a value. // - 'y' where 'y' is a value (or a bool key with an inferred value of true) // and its key is inferred. - let lhs = lexUntil(atPossibleEnding).value + let lhs = lexUntil(atPossibleEnding) if tryEat("=") { - let rhs = lexUntil(atPossibleEnding).value + let rhs = lexUntil(atPossibleEnding) return (lhs, rhs) } return (nil, lhs) } - private static func classifyCharacterPropertyContents( - key: String?, value: String - ) throws -> AST.Atom.CharacterProperty.Kind { + private mutating func classifyCharacterPropertyContents( + key: Located?, value: Located + ) -> AST.Atom.CharacterProperty.Kind { if let key = key { - return try classifyCharacterProperty(key: key, value: value) + return classifyCharacterProperty(key: key, value: value) } - return try classifyCharacterPropertyValueOnly(value) + return classifyCharacterPropertyValueOnly(value) } /// Try to consume a character property. @@ -1381,17 +1402,18 @@ extension Source { /// Prop -> [\s\w-]+ /// private mutating func lexCharacterProperty( - ) throws -> Located? { - try recordLoc { src in + ) -> Located? 
{ + recordLoc { p in // '\P{...}' is the inverted version of '\p{...}' - guard src.starts(with: "p{") || src.starts(with: "P{") else { return nil } - let isInverted = src.peek() == "P" - src.advance(2) - - let (key, value) = src.lexCharacterPropertyKeyValue() - let prop = try Source.classifyCharacterPropertyContents(key: key, - value: value) - try src.expect("}") + guard p.src.starts(with: "p{") || p.src.starts(with: "P{") else { + return nil + } + let isInverted = p.peek() == "P" + p.advance(2) + + let (key, value) = p.lexCharacterPropertyKeyValue() + let prop = p.classifyCharacterPropertyContents(key: key, value: value) + p.expect("}") return .init(prop, isInverted: isInverted, isPOSIX: false) } } @@ -1402,27 +1424,28 @@ extension Source { /// private mutating func lexNumberedReference( allowWholePatternRef: Bool = false, allowRecursionLevel: Bool = false - ) throws -> AST.Reference? { - let kind = try recordLoc { src -> AST.Reference.Kind? in - try src.tryEating { src in + ) -> AST.Reference? { + let kind = recordLoc { p -> AST.Reference.Kind? in + p.tryEating { p in // Note this logic should match canLexNumberedReference. 
- if src.tryEat("+"), let num = try src.lexNumber() { - return .relative(num.value) + if let plus = p.tryEatWithLoc("+"), let num = p.lexNumber() { + return .relative(.init(num.value, at: num.location.union(with: plus))) } - if src.tryEat("-"), let num = try src.lexNumber() { - return .relative(-num.value) + if let minus = p.tryEatWithLoc("-"), let num = p.lexNumber() { + let val = num.value.map { x in -x } + return .relative(.init(val, at: num.location.union(with: minus))) } - if let num = try src.lexNumber() { - return .absolute(num.value) + if let num = p.lexNumber() { + return .absolute(num) } return nil } } guard let kind = kind else { return nil } - guard allowWholePatternRef || kind.value != .recurseWholePattern else { - throw ParseError.cannotReferToWholePattern + if !allowWholePatternRef && kind.value.recursesWholePattern { + error(.cannotReferToWholePattern, at: kind.location) } - let recLevel = allowRecursionLevel ? try lexRecursionLevel() : nil + let recLevel = allowRecursionLevel ? lexRecursionLevel() : nil let loc = recLevel?.location.union(with: kind.location) ?? kind.location return .init(kind.value, recursionLevel: recLevel, innerLoc: loc) } @@ -1432,19 +1455,21 @@ extension Source { /// RecursionLevel -> '+' | '-' /// private mutating func lexRecursionLevel( - ) throws -> Located? { - try recordLoc { src in - if src.tryEat("+") { return try src.expectNumber().value } - if src.tryEat("-") { return try -src.expectNumber().value } + ) -> AST.Atom.Number? { + let value = recordLoc { p -> Int? in + if p.tryEat("+") { return p.expectNumber().value } + if p.tryEat("-") { return p.expectNumber().value.map { x in -x } } return nil } + guard let value = value else { return nil } + return .init(value.value, at: value.location) } /// Checks whether a numbered reference can be lexed. 
- private func canLexNumberedReference() -> Bool { - lookahead { src in - _ = src.tryEat(anyOf: "+", "-") - guard let next = src.peek() else { return false } + private mutating func canLexNumberedReference() -> Bool { + lookahead { p in + _ = p.tryEat(anyOf: "+", "-") + guard let next = p.peek() else { return false } return RadixKind.decimal.characterFilter(next) } } @@ -1453,18 +1478,18 @@ extension Source { private mutating func expectNamedReference( endingWith end: String, eatEnding: Bool = true, allowRecursionLevel: Bool = false - ) throws -> AST.Reference { + ) -> AST.Reference { // Note we don't want to eat the ending as we may also want to parse a // recursion level. - let str = try expectIdentifier( + let str = expectIdentifier( .groupName, endingWith: end, eatEnding: false) - // If we're allowed to, try parse a recursion level. - let recLevel = allowRecursionLevel ? try lexRecursionLevel() : nil + // If we're allowed to, parse a recursion level. + let recLevel = allowRecursionLevel ? lexRecursionLevel() : nil let loc = recLevel?.location.union(with: str.location) ?? str.location if eatEnding { - try expect(sequence: end) + expect(sequence: end) } return .init(.named(str.value), recursionLevel: recLevel, innerLoc: loc) } @@ -1475,8 +1500,8 @@ extension Source { endingWith end: String, eatEnding: Bool = true, allowRecursionLevel: Bool = false ) -> AST.Reference? { - tryEating { src in - try? 
src.expectNamedReference( + tryEating { p in + p.expectNamedReference( endingWith: end, eatEnding: eatEnding, allowRecursionLevel: allowRecursionLevel ) @@ -1490,32 +1515,34 @@ extension Source { private mutating func expectNamedOrNumberedReference( endingWith ending: String, eatEnding: Bool = true, allowWholePatternRef: Bool = false, allowRecursionLevel: Bool = false - ) throws -> AST.Reference { - let num = try lexNumberedReference( + ) -> AST.Reference { + let num = lexNumberedReference( allowWholePatternRef: allowWholePatternRef, allowRecursionLevel: allowRecursionLevel ) if let num = num { if eatEnding { - try expect(sequence: ending) + expect(sequence: ending) } return num } - return try expectNamedReference( + return expectNamedReference( endingWith: ending, eatEnding: eatEnding, allowRecursionLevel: allowRecursionLevel ) } - private static func getClosingDelimiter( + private mutating func getClosingDelimiter( for openChar: Character ) -> Character { switch openChar { // Identically-balanced delimiters. - case "'", "\"", "`", "^", "%", "#", "$": return openChar - case "<": return ">" - case "{": return "}" - default: fatalError("Not implemented") + case "'", "\"", "`", "^", "%", "#", "$": return openChar + case "<": return ">" + case "{": return "}" + default: + unreachable("Unhandled case") + return openChar } } @@ -1530,58 +1557,54 @@ extension Source { /// | 'k{' '}' /// | [1-9] [0-9]+ /// - private mutating func lexEscapedReference( - context: ParsingContext - ) throws -> Located? { - try recordLoc { src in - try src.tryEating { src in - guard let firstChar = src.peek() else { return nil } + private mutating func lexEscapedReference() -> Located? { + recordLoc { p in + p.tryEating { p in + guard let firstChar = p.peek() else { return nil } - if src.tryEat("g") { + if p.tryEat("g") { // PCRE-style backreferences. 
- if src.tryEat("{") { - let ref = try src.expectNamedOrNumberedReference(endingWith: "}") + if p.tryEat("{") { + let ref = p.expectNamedOrNumberedReference(endingWith: "}") return .backreference(ref) } // Oniguruma-style subpatterns. - if let openChar = src.tryEat(anyOf: "<", "'") { - let closing = String(Source.getClosingDelimiter(for: openChar)) - return .subpattern(try src.expectNamedOrNumberedReference( + if let openChar = p.tryEat(anyOf: "<", "'") { + let closing = String(p.getClosingDelimiter(for: openChar)) + return .subpattern(p.expectNamedOrNumberedReference( endingWith: closing, allowWholePatternRef: true)) } // PCRE allows \g followed by a bare numeric reference. - if let ref = try src.lexNumberedReference() { + if let ref = p.lexNumberedReference() { return .backreference(ref) } return nil } - if src.tryEat("k") { + if p.tryEat("k") { // Perl/.NET/Oniguruma-style backreferences. - if let openChar = src.tryEat(anyOf: "<", "'") { - let closing = String(Source.getClosingDelimiter(for: openChar)) + if let openChar = p.tryEat(anyOf: "<", "'") { + let closing = String(p.getClosingDelimiter(for: openChar)) // Perl only accept named references here, but Oniguruma and .NET // also accepts numbered references. This shouldn't be an ambiguity // as named references may not begin with a digit, '-', or '+'. // Oniguruma also allows a recursion level to be specified. - return .backreference(try src.expectNamedOrNumberedReference( + return .backreference(p.expectNamedOrNumberedReference( endingWith: closing, allowRecursionLevel: true)) } // Perl/.NET also allow a named references with the '{' delimiter. - if src.tryEat("{") { - return .backreference( - try src.expectNamedReference(endingWith: "}")) + if p.tryEat("{") { + return .backreference(p.expectNamedReference(endingWith: "}")) } return nil } // Backslash followed by a non-0 digit character is a backreference. 
- if firstChar != "0", let numAndLoc = try src.lexNumber() { - return .backreference(.init( - .absolute(numAndLoc.value), innerLoc: numAndLoc.location)) + if firstChar != "0", let num = p.lexNumber() { + return .backreference(.init(.absolute(num), innerLoc: num.location)) } return nil } @@ -1598,35 +1621,35 @@ extension Source { /// | NumberRef /// private mutating func lexGroupLikeReference( - ) throws -> Located? { - try recordLoc { src in - try src.tryEating { src in - guard src.tryEat(sequence: "(?") else { return nil } + ) -> Located? { + recordLoc { p in + p.tryEating { p in + guard p.tryEat(sequence: "(?") else { return nil } // Note the below should be covered by canLexGroupLikeReference. // Python-style references. - if src.tryEat(sequence: "P=") { - return .backreference(try src.expectNamedReference(endingWith: ")")) + if p.tryEat(sequence: "P=") { + return .backreference(p.expectNamedReference(endingWith: ")")) } - if src.tryEat(sequence: "P>") { - return .subpattern(try src.expectNamedReference(endingWith: ")")) + if p.tryEat(sequence: "P>") { + return .subpattern(p.expectNamedReference(endingWith: ")")) } // Perl-style subpatterns. - if src.tryEat("&") { - return .subpattern(try src.expectNamedReference(endingWith: ")")) + if p.tryEat("&") { + return .subpattern(p.expectNamedReference(endingWith: ")")) } // Whole-pattern recursion, which is equivalent to (?0). - if let loc = src.tryEatWithLoc("R") { - try src.expect(")") - return .subpattern(.init(.recurseWholePattern, innerLoc: loc)) + if let loc = p.tryEatWithLoc("R") { + p.expect(")") + return .subpattern(.init(.recurseWholePattern(loc), innerLoc: loc)) } // Numbered subpattern reference. 
- if let ref = try src.lexNumberedReference(allowWholePatternRef: true) { - try src.expect(")") + if let ref = p.lexNumberedReference(allowWholePatternRef: true) { + p.expect(")") return .subpattern(ref) } return nil @@ -1635,53 +1658,51 @@ extension Source { } /// Whether we can lex a group-like reference after the specifier '(?'. - private func canLexGroupLikeReference() -> Bool { - lookahead { src in - if src.tryEat("P") { - return src.tryEat(anyOf: "=", ">") != nil + private mutating func canLexGroupLikeReference() -> Bool { + lookahead { p in + if p.tryEat("P") { + return p.tryEat(anyOf: "=", ">") != nil } - if src.tryEat(anyOf: "&", "R") != nil { + if p.tryEat(anyOf: "&", "R") != nil { return true } - return src.canLexNumberedReference() + return p.canLexNumberedReference() } } - private func canLexMatchingOptionsAsAtom(context: ParsingContext) -> Bool { - lookahead { src in + private mutating func canLexMatchingOptionsAsAtom() -> Bool { + lookahead { p in // See if we can lex a matching option sequence that terminates in ')'. - // Such a sequence is an atom. If an error is thrown, there are invalid - // elements of the matching option sequence. In such a case, we can lex as - // a group and diagnose the invalid group kind. - guard (try? src.lexMatchingOptionSequence(context: context)) != nil else { + // Such a sequence is an atom. + guard p.lexMatchingOptionSequence() != nil else { return false } - return src.tryEat(")") + return p.tryEat(")") } } /// Whether a group specifier should be lexed as an atom instead of a group. - private func shouldLexGroupLikeAtom(context: ParsingContext) -> Bool { - lookahead { src in - guard src.tryEat("(") else { return false } + private mutating func shouldLexGroupLikeAtom() -> Bool { + lookahead { p in + guard p.tryEat("(") else { return false } - if src.tryEat("?") { + if p.tryEat("?") { // The start of a reference '(?P=', '(?R', ... 
- if src.canLexGroupLikeReference() { return true } + if p.canLexGroupLikeReference() { return true } // The start of a PCRE callout. - if src.tryEat("C") { return true } + if p.tryEat("C") { return true } // The start of an Oniguruma 'of-contents' callout. - if src.tryEat("{") { return true } + if p.tryEat("{") { return true } // A matching option atom (?x), (?i), ... - if src.canLexMatchingOptionsAsAtom(context: context) { return true } + if p.canLexMatchingOptionsAsAtom() { return true } return false } // The start of a backreference directive or Oniguruma named callout. - if src.tryEat("*") { return true } + if p.tryEat("*") { return true } return false } @@ -1693,47 +1714,50 @@ extension Source { /// | UniScalar | Property | NamedCharacter /// | EscapedReference /// - mutating func expectEscaped( - context: ParsingContext - ) throws -> Located { - try recordLoc { src in - let ccc = context.isInCustomCharacterClass + mutating func expectEscaped() -> Located { + recordLoc { p in + let ccc = p.context.isInCustomCharacterClass // Keyboard control/meta - if src.tryEat("c") || src.tryEat(sequence: "C-") { - return .keyboardControl(try src.expectASCII().value) + if p.tryEat("c") || p.tryEat(sequence: "C-") { + guard let ascii = p.expectASCII() else { return .invalid } + return .keyboardControl(ascii.value) } - if src.tryEat(sequence: "M-\\C-") { - return .keyboardMetaControl(try src.expectASCII().value) + if p.tryEat(sequence: "M-\\C-") { + guard let ascii = p.expectASCII() else { return .invalid } + return .keyboardMetaControl(ascii.value) } - if src.tryEat(sequence: "M-") { - return .keyboardMeta(try src.expectASCII().value) + if p.tryEat(sequence: "M-") { + guard let ascii = p.expectASCII() else { return .invalid } + return .keyboardMeta(ascii.value) } // Named character '\N{...}'. - if let char = try src.lexNamedCharacter() { + if let char = p.lexNamedCharacter() { return char.value } // Character property \p{...} \P{...}. 
- if let prop = try src.lexCharacterProperty() { + if let prop = p.lexCharacterProperty() { return .property(prop.value) } // References using escape syntax, e.g \1, \g{1}, \k<...>, ... // These are not valid inside custom character classes. - if !ccc, let ref = try src.lexEscapedReference(context: context)?.value { + if !ccc, let ref = p.lexEscapedReference()?.value { return ref } // Hexadecimal and octal unicode scalars. - if let scalar = try src.lexUnicodeScalar() { + if let scalar = p.lexUnicodeScalar() { return scalar } - guard let char = src.tryEat() else { - throw ParseError.expectedEscape + guard let charLoc = p.tryEatWithLoc() else { + p.errorAtCurrentPosition(.expectedEscape) + return .invalid } + let char = charLoc.value // Single-character builtins. if let builtin = AST.Atom.EscapedBuiltin( @@ -1745,10 +1769,9 @@ extension Source { // We only allow unknown escape sequences for non-letter non-number ASCII, // and non-ASCII whitespace. // TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`. - guard (char.isASCII && !char.isLetter && !char.isNumber) || - (!char.isASCII && char.isWhitespace) - else { - throw ParseError.invalidEscape(char) + if (char.isASCII && (char.isLetter || char.isNumber)) || + (!char.isASCII && !char.isWhitespace) { + p.error(.invalidEscape(char), at: charLoc.location) } return .char(char) } @@ -1767,32 +1790,34 @@ extension Source { /// | '$' '$' /// | '{' '}' /// - mutating func lexPCRECallout() throws -> AST.Atom.Callout? { + mutating func lexPCRECallout() -> AST.Atom.Callout? { guard tryEat(sequence: "(?C") else { return nil } - let arg = try recordLoc { src -> AST.Atom.Callout.PCRE.Argument in + let arg = recordLoc { p -> AST.Atom.Callout.PCRE.Argument in // Parse '(?C' followed by a number. - if let num = try src.lexNumber() { - return .number(num.value) + if let num = p.lexNumber() { + return .number(num) } // '(?C)' is implicitly '(?C0)'. 
- if src.peek() == ")" { - return .number(0) + if p.peek() == ")" { + return .number(.init(0, at: p.loc(p.src.currentPosition))) } // Parse '(C?' followed by a set of balanced delimiters as defined by // http://pcre.org/current/doc/html/pcre2pattern.html#SEC28 - if let open = src.tryEat(anyOf: "`", "'", "\"", "^", "%", "#", "$", "{") { - let closing = String(Source.getClosingDelimiter(for: open)) - return .string(try src.expectQuoted(endingWith: closing).value) + if let open = p.tryEat(anyOf: "`", "'", "\"", "^", "%", "#", "$", "{") { + let closing = String(p.getClosingDelimiter(for: open)) + return .string(p.expectQuoted(endingWith: closing).value) } // If we don't know what this syntax is, consume up to the ending ')' and // emit an error. - let remaining = src.lexUntil { $0.isEmpty || $0.tryEat(")") }.value - if remaining.isEmpty { - throw ParseError.expected(")") + let remaining = p.lexUntil { $0.src.isEmpty || $0.peek() == ")" } + if p.src.isEmpty && remaining.value.isEmpty { + p.errorAtCurrentPosition(.expected(")")) + } else { + p.error(.unknownCalloutKind("(?C\(remaining.value))"), at: remaining.location) } - throw ParseError.unknownCalloutKind("(?C\(remaining))") + return .string(remaining.value) } - try expect(")") + expect(")") return .pcre(.init(arg)) } @@ -1803,22 +1828,24 @@ extension Source { /// mutating func expectOnigurumaCalloutArgList( leftBrace: SourceLocation - ) throws -> AST.Atom.Callout.OnigurumaNamed.ArgList { + ) -> AST.Atom.Callout.OnigurumaNamed.ArgList { var args: [Located] = [] while true { - let arg = try recordLoc { src -> String in + let arg = recordLoc { p -> String? in // TODO: Warn about whitespace being included? 
- guard let arg = src.tryEatPrefix({ $0 != "," && $0 != "}" }) else { - throw ParseError.expectedCalloutArgument + guard let arg = p.tryEatPrefix({ $0 != "," && $0 != "}" }) else { + p.errorAtCurrentPosition(.expectedCalloutArgument) + return nil } - return arg.string + return arg.value } - args.append(arg) - - if peek() == "}" { break } - try expect(",") + if let arg = arg { + args.append(arg) + } + if src.isEmpty || peek() == "}" { break } + expect(",") } - let rightBrace = try expect("}") + let rightBrace = expectWithLoc("}").location return .init(leftBrace, args, rightBrace) } @@ -1827,12 +1854,12 @@ extension Source { /// OnigurumaTag -> '[' Identifier ']' /// mutating func lexOnigurumaCalloutTag( - ) throws -> AST.Atom.Callout.OnigurumaTag? { + ) -> AST.Atom.Callout.OnigurumaTag? { guard let leftBracket = tryEatWithLoc("[") else { return nil } - let name = try expectIdentifier( + let name = expectIdentifier( .onigurumaCalloutTag, endingWith: "]", eatEnding: false ) - let rightBracket = try expect("]") + let rightBracket = expectWithLoc("]").location return .init(leftBracket, name, rightBracket) } @@ -1841,19 +1868,18 @@ extension Source { /// OnigurumaNamedCallout -> '(*' Identifier OnigurumaTag? Args? ')' /// Args -> '{' OnigurumaCalloutArgList '}' /// - mutating func lexOnigurumaNamedCallout() throws -> AST.Atom.Callout? { - try tryEating { src in - guard src.tryEat(sequence: "(*") else { return nil } - guard let name = src.lexIdentifier( + mutating func lexOnigurumaNamedCallout() -> AST.Atom.Callout? 
{ + tryEating { p in + guard p.tryEat(sequence: "(*") else { return nil } + let name = p.expectIdentifier( .onigurumaCalloutName, endingWith: ")", eatEnding: false) - else { return nil } - let tag = try src.lexOnigurumaCalloutTag() + let tag = p.lexOnigurumaCalloutTag() - let args = try src.tryEatWithLoc("{").map { - try src.expectOnigurumaCalloutArgList(leftBrace: $0) + let args = p.tryEatWithLoc("{").map { + p.expectOnigurumaCalloutArgList(leftBrace: $0) } - try src.expect(")") + p.expect(")") return .onigurumaNamed(.init(name, tag: tag, args: args)) } } @@ -1864,32 +1890,33 @@ extension Source { /// Contents -> /// Direction -> 'X' | '<' | '>' /// - mutating func lexOnigurumaCalloutOfContents() throws -> AST.Atom.Callout? { - try tryEating { src in - guard src.tryEat(sequence: "(?"), - let openBraces = src.tryEatPrefix({ $0 == "{" }) + mutating func lexOnigurumaCalloutOfContents() -> AST.Atom.Callout? { + tryEating { p in + guard p.tryEat(sequence: "(?"), + let openBraces = p.tryEatPrefix({ $0 == "{" }) else { return nil } - let contents = try src.expectQuoted( - endingWith: "}", count: openBraces.count) + let contents = p.expectQuoted( + endingWith: "}", count: openBraces.value.count) let closeBraces = SourceLocation( - contents.location.end ..< src.currentPosition) + contents.location.end ..< p.src.currentPosition) - let tag = try src.lexOnigurumaCalloutTag() + let tag = p.lexOnigurumaCalloutTag() typealias Direction = AST.Atom.Callout.OnigurumaOfContents.Direction - let direction = src.recordLoc { src -> Direction in - if src.tryEat(">") { return .inProgress } - if src.tryEat("<") { return .inRetraction } - if src.tryEat("X") { return .both } + let direction = p.recordLoc { p -> Direction in + if p.tryEat(">") { return .inProgress } + if p.tryEat("<") { return .inRetraction } + if p.tryEat("X") { return .both } // The default is in-progress. 
return .inProgress } - try src.expect(")") + p.expect(")") - let openBracesLoc = SourceLocation(from: openBraces) return .onigurumaOfContents(.init( - openBracesLoc, contents, closeBraces, tag: tag, direction: direction)) + openBraces.location, contents, closeBraces, tag: tag, + direction: direction + )) } } @@ -1900,94 +1927,93 @@ extension Source { /// | 'COMMIT' | 'PRUNE' | 'SKIP' | 'THEN' /// mutating func lexBacktrackingDirective( - ) throws -> AST.Atom.BacktrackingDirective? { - try tryEating { src in - guard src.tryEat(sequence: "(*") else { return nil } - let kind = src.recordLoc { src -> AST.Atom.BacktrackingDirective.Kind? in - if src.tryEat(sequence: "ACCEPT") { return .accept } - if src.tryEat(sequence: "FAIL") || src.tryEat("F") { return .fail } - if src.tryEat(sequence: "MARK") || src.peek() == ":" { return .mark } - if src.tryEat(sequence: "COMMIT") { return .commit } - if src.tryEat(sequence: "PRUNE") { return .prune } - if src.tryEat(sequence: "SKIP") { return .skip } - if src.tryEat(sequence: "THEN") { return .then } + ) -> AST.Atom.BacktrackingDirective? { + tryEating { p in + guard p.tryEat(sequence: "(*") else { return nil } + let kind = p.recordLoc { p -> AST.Atom.BacktrackingDirective.Kind? in + if p.tryEat(sequence: "ACCEPT") { return .accept } + if p.tryEat(sequence: "FAIL") || p.tryEat("F") { return .fail } + if p.tryEat(sequence: "MARK") || p.peek() == ":" { return .mark } + if p.tryEat(sequence: "COMMIT") { return .commit } + if p.tryEat(sequence: "PRUNE") { return .prune } + if p.tryEat(sequence: "SKIP") { return .skip } + if p.tryEat(sequence: "THEN") { return .then } return nil } guard let kind = kind else { return nil } var name: Located? - if src.tryEat(":") { + if p.tryEat(":") { // TODO: PCRE allows escaped delimiters or '\Q...\E' sequences in the // name under PCRE2_ALT_VERBNAMES. It also allows whitespace under (?x). 
- name = try src.expectQuoted(endingWith: ")", eatEnding: false) + name = p.expectQuoted(endingWith: ")", eatEnding: false) } - try src.expect(")") + p.expect(")") // MARK directives must be named. if name == nil && kind.value == .mark { - throw ParseError.backtrackingDirectiveMustHaveName( - String(src[kind.location.range])) + let kindStr = String(p.src[kind.location.range]) + p.error(.backtrackingDirectiveMustHaveName(kindStr), at: kind.location) } return .init(kind, name: name) } } - /// Consume a group-like atom, throwing an error if an atom could not be + /// Consume a group-like atom, diagnosing an error if an atom could not be /// produced. /// /// GroupLikeAtom -> GroupLikeReference | Callout | BacktrackingDirective /// - mutating func expectGroupLikeAtom( - context: ParsingContext - ) throws -> AST.Atom.Kind { - try recordLoc { src in - // References that look like groups, e.g (?R), (?1), ... - if let ref = try src.lexGroupLikeReference() { - return ref.value - } + mutating func expectGroupLikeAtom() -> AST.Atom.Kind { + // References that look like groups, e.g (?R), (?1), ... + if let ref = lexGroupLikeReference() { + return ref.value + } - // Change matching options atom (?i), (?x-i), ... - if let seq = try src.lexChangeMatchingOptionAtom(context: context) { - return .changeMatchingOptions(seq) - } + // Change matching options atom (?i), (?x-i), ... + if let seq = lexChangeMatchingOptionAtom() { + return .changeMatchingOptions(seq) + } - // (*ACCEPT), (*FAIL), (*MARK), ... - if let b = try src.lexBacktrackingDirective() { - return .backtrackingDirective(b) - } + // (*ACCEPT), (*FAIL), (*MARK), ... + if let b = lexBacktrackingDirective() { + return .backtrackingDirective(b) + } - // Global matching options can only appear at the very start. - if let opt = try src.lexGlobalMatchingOption() { - throw ParseError.globalMatchingOptionNotAtStart( - String(src[opt.location.range])) - } + // Global matching options can only appear at the very start. 
+ if let opt = lexGlobalMatchingOption() { + let optStr = String(src[opt.location.range]) + error(.globalMatchingOptionNotAtStart(optStr), at: opt.location) + return .invalid + } - // (?C) - if let callout = try src.lexPCRECallout() { - return .callout(callout) - } + // (?C) + if let callout = lexPCRECallout() { + return .callout(callout) + } - // Try to consume an Oniguruma named callout '(*name)', which should be - // done after backtracking directives and global options. - if let callout = try src.lexOnigurumaNamedCallout() { - return .callout(callout) - } + // Try to consume an Oniguruma named callout '(*name)', which should be + // done after backtracking directives and global options. + if let callout = lexOnigurumaNamedCallout() { + return .callout(callout) + } - // (?{...}) - if let callout = try src.lexOnigurumaCalloutOfContents() { - return .callout(callout) - } + // (?{...}) + if let callout = lexOnigurumaCalloutOfContents() { + return .callout(callout) + } - // If we didn't produce an atom, consume up until a reasonable end-point - // and throw an error. - try src.expect("(") - let remaining = src.lexUntil { - $0.isEmpty || $0.tryEat(anyOf: ":", ")") != nil - }.value - if remaining.isEmpty { - throw ParseError.expected(")") - } - throw ParseError.unknownGroupKind(remaining) - }.value + // If we didn't produce an atom, consume up until a reasonable end-point + // and diagnose an error. + expect("(") + let remaining = lexUntil { + $0.src.isEmpty || $0.tryEat(anyOf: ":", ")") != nil + } + if remaining.value.isEmpty { + error(.expected(")"), at: remaining.location) + } else { + error(.unknownGroupKind(remaining.value), at: remaining.location) + } + return .invalid } @@ -2002,43 +2028,49 @@ extension Source { /// /// ExpGroupStart -> '(_:' /// - mutating func lexAtom(context: ParsingContext) throws -> AST.Atom? { + mutating func lexAtom() -> AST.Atom? { let customCC = context.isInCustomCharacterClass - let kind: Located? 
= try recordLoc { src in + let kind = recordLoc { p -> AST.Atom.Kind? in // Check for not-an-atom, e.g. parser recursion termination - if src.isEmpty { return nil } - if !customCC && (src.peek() == ")" || src.peek() == "|") { return nil } + if p.src.isEmpty { return nil } + if !customCC && (p.peek() == ")" || p.peek() == "|") { return nil } // TODO: Store customCC in the atom, if that's useful // POSIX character property. Like \p{...} this is also allowed outside of // a custom character class. - if let prop = try src.lexPOSIXCharacterProperty()?.value { + if let prop = p.lexPOSIXCharacterProperty()?.value { return .property(prop) } // If we have group syntax that was skipped over in lexGroupStart, we - // need to handle it as an atom, or throw an error. - if !customCC && src.shouldLexGroupLikeAtom(context: context) { - return try src.expectGroupLikeAtom(context: context) + // need to handle it as an atom, or diagnose an error. + if !customCC && p.shouldLexGroupLikeAtom() { + return p.expectGroupLikeAtom() } // A quantifier here is invalid. - if !customCC, - let q = try src.recordLoc({ try $0.lexQuantifier(context: context) }) { - throw ParseError.quantifierRequiresOperand( - String(src[q.location.range])) + if !customCC, let q = p.recordLoc({ $0.lexQuantifier() }) { + let str = String(p.src[q.location.range]) + p.error(.quantifierRequiresOperand(str), at: q.location) + return .invalid } - let char = src.eat() + guard let charLoc = p.tryEatWithLoc() else { + // We check at the beginning of the function for `isEmpty`, so we should + // not be at the end of the input here. 
+ p.unreachable("Unexpected end of input") + return nil + } + let char = charLoc.value switch char { case ")", "|": if customCC { return .char(char) } - throw Unreachable("TODO: reason") + p.unreachable("Is as a termination condition") case "(" where !customCC: - throw Unreachable("Should have lexed a group or group-like atom") + p.unreachable("Should have lexed a group or group-like atom") // (sometimes) special metacharacters case ".": return customCC ? .char(".") : .any @@ -2046,7 +2078,7 @@ extension Source { case "$": return customCC ? .char("$") : .endOfLine // Escaped - case "\\": return try src.expectEscaped(context: context).value + case "\\": return p.expectEscaped().value case "]": assert(!customCC, "parser should have prevented this") @@ -2060,7 +2092,7 @@ extension Source { let scalars = char.unicodeScalars if scalars.count > 1 && scalars.first!.isASCII && char != "\r\n" && !char.isLetter && !char.isNumber { - throw ParseError.confusableCharacter(char) + p.error(.confusableCharacter(char), at: charLoc.location) } break } @@ -2082,7 +2114,7 @@ extension Source { /// NewlineSequenceKind -> 'BSR_ANYCRLF' | 'BSR_UNICODE' /// private mutating func lexNewlineSequenceMatchingOption( - ) throws -> AST.GlobalMatchingOption.NewlineSequenceMatching? { + ) -> AST.GlobalMatchingOption.NewlineSequenceMatching? { if tryEat(sequence: "BSR_ANYCRLF") { return .anyCarriageReturnOrLinefeed } if tryEat(sequence: "BSR_UNICODE") { return .anyUnicode } return nil @@ -2093,7 +2125,7 @@ extension Source { /// NewlineKind -> 'CRLF' | 'CR' | 'ANYCRLF' | 'ANY' | 'LF' | 'NUL' /// private mutating func lexNewlineMatchingOption( - ) throws -> AST.GlobalMatchingOption.NewlineMatching? { + ) -> AST.GlobalMatchingOption.NewlineMatching? { // The ordering here is important: CRLF needs to precede CR, and ANYCRLF // needs to precede ANY to ensure we don't short circuit on the wrong one. 
if tryEat(sequence: "CRLF") { return .carriageAndLinefeedOnly } @@ -2119,38 +2151,38 @@ extension Source { /// | 'LIMIT_MATCH' /// private mutating func lexGlobalMatchingOptionKind( - ) throws -> Located? { - try recordLoc { src in - if let opt = try src.lexNewlineSequenceMatchingOption() { + ) -> Located? { + recordLoc { p in + if let opt = p.lexNewlineSequenceMatchingOption() { return .newlineSequenceMatching(opt) } - if let opt = try src.lexNewlineMatchingOption() { + if let opt = p.lexNewlineMatchingOption() { return .newlineMatching(opt) } - if src.tryEat(sequence: "LIMIT_DEPTH") { - try src.expect("=") - return .limitDepth(try src.expectNumber()) + if p.tryEat(sequence: "LIMIT_DEPTH") { + p.expect("=") + return .limitDepth(p.expectNumber()) } - if src.tryEat(sequence: "LIMIT_HEAP") { - try src.expect("=") - return .limitHeap(try src.expectNumber()) + if p.tryEat(sequence: "LIMIT_HEAP") { + p.expect("=") + return .limitHeap(p.expectNumber()) } - if src.tryEat(sequence: "LIMIT_MATCH") { - try src.expect("=") - return .limitMatch(try src.expectNumber()) + if p.tryEat(sequence: "LIMIT_MATCH") { + p.expect("=") + return .limitMatch(p.expectNumber()) } // The ordering here is important: NOTEMPTY_ATSTART needs to precede // NOTEMPTY to ensure we don't short circuit on the wrong one. 
- if src.tryEat(sequence: "NOTEMPTY_ATSTART") { return .notEmptyAtStart } - if src.tryEat(sequence: "NOTEMPTY") { return .notEmpty } - - if src.tryEat(sequence: "NO_AUTO_POSSESS") { return .noAutoPossess } - if src.tryEat(sequence: "NO_DOTSTAR_ANCHOR") { return .noDotStarAnchor } - if src.tryEat(sequence: "NO_JIT") { return .noJIT } - if src.tryEat(sequence: "NO_START_OPT") { return .noStartOpt } - if src.tryEat(sequence: "UTF") { return .utfMode } - if src.tryEat(sequence: "UCP") { return .unicodeProperties } + if p.tryEat(sequence: "NOTEMPTY_ATSTART") { return .notEmptyAtStart } + if p.tryEat(sequence: "NOTEMPTY") { return .notEmpty } + + if p.tryEat(sequence: "NO_AUTO_POSSESS") { return .noAutoPossess } + if p.tryEat(sequence: "NO_DOTSTAR_ANCHOR") { return .noDotStarAnchor } + if p.tryEat(sequence: "NO_JIT") { return .noJIT } + if p.tryEat(sequence: "NO_START_OPT") { return .noStartOpt } + if p.tryEat(sequence: "UTF") { return .utfMode } + if p.tryEat(sequence: "UCP") { return .unicodeProperties } return nil } } @@ -2160,13 +2192,13 @@ extension Source { /// GlobalMatchingOption -> '(*' GlobalMatchingOptionKind ')' /// mutating func lexGlobalMatchingOption( - ) throws -> AST.GlobalMatchingOption? { - let kind = try recordLoc { src -> AST.GlobalMatchingOption.Kind? in - try src.tryEating { src in - guard src.tryEat(sequence: "(*"), - let kind = try src.lexGlobalMatchingOptionKind()?.value + ) -> AST.GlobalMatchingOption? { + let kind = recordLoc { p -> AST.GlobalMatchingOption.Kind? in + p.tryEating { p in + guard p.tryEat(sequence: "(*"), + let kind = p.lexGlobalMatchingOptionKind()?.value else { return nil } - try src.expect(")") + p.expect(")") return kind } } @@ -2179,9 +2211,9 @@ extension Source { /// GlobalMatchingOptionSequence -> GlobalMatchingOption+ /// mutating func lexGlobalMatchingOptionSequence( - ) throws -> AST.GlobalMatchingOptionSequence? { + ) -> AST.GlobalMatchingOptionSequence? 
{ var opts: [AST.GlobalMatchingOption] = [] - while let opt = try lexGlobalMatchingOption() { + while let opt = lexGlobalMatchingOption() { opts.append(opt) } return .init(opts) diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 52861f23d..0aae031d5 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -96,8 +96,10 @@ struct ParsingContext { func isPriorGroupRef(_ ref: AST.Reference.Kind) -> Bool { switch ref { case .absolute(let i): + guard let i = i.value else { return false } return i <= priorGroupCount case .relative(let i): + guard let i = i.value else { return false } return i < 0 case .named(let str): return usedGroupNames.contains(str) @@ -105,12 +107,13 @@ struct ParsingContext { } } -private struct Parser { - var source: Source +struct Parser { + var src: Source var context: ParsingContext + var diags = Diagnostics() - init(_ source: Source, syntax: SyntaxOptions) { - self.source = source + init(_ src: Source, syntax: SyntaxOptions) { + self.src = src self.context = ParsingContext(syntax: syntax) } } @@ -126,10 +129,20 @@ extension ParsingContext { // Diagnostics extension Parser { - fileprivate func loc( + func loc( _ start: Source.Position ) -> SourceLocation { - SourceLocation(start ..< source.currentPosition) + SourceLocation(start ..< src.currentPosition) + } + + mutating func error(_ err: ParseError, at loc: SourceLocation) { + diags.error(err, at: loc) + } + mutating func errorAtCurrentPosition(_ err: ParseError) { + diags.error(err, at: loc(src.currentPosition)) + } + mutating func unreachable(_ err: String) { + diags.fatal(.unreachable(err), at: loc(src.currentPosition)) } } @@ -139,9 +152,9 @@ extension Parser { /// /// Regex -> GlobalMatchingOptionSequence? RegexNode /// - mutating func parse() throws -> AST { + mutating func parse() -> AST { // First parse any global matching options if present. 
- let opts = try source.lexGlobalMatchingOptionSequence() + let opts = lexGlobalMatchingOptionSequence() // If we have a newline mode global option, update the context accordingly. if let opts = opts { @@ -153,17 +166,19 @@ extension Parser { } // Then parse the root AST node. - let ast = try parseNode() - guard source.isEmpty else { + let ast = parseNode() + if !src.isEmpty { // parseConcatenation() terminates on encountering a ')' to enable // recursive parses of a group body. However for a top-level parse, this // means we have an unmatched closing paren, so let's diagnose. - if let loc = source.tryEatWithLoc(")") { - throw Source.LocatedError(ParseError.unbalancedEndOfGroup, loc) + // TODO: We should continue to parse for better recovery. + if let loc = tryEatWithLoc(")") { + error(.unbalancedEndOfGroup, at: loc) + } else { + unreachable("Unhandled termination condition") } - fatalError("Unhandled termination condition") } - return .init(ast, globalOptions: opts) + return .init(ast, globalOptions: opts, diags: diags) } /// Parse a regular expression node. 
This should be used instead of `parse()` @@ -172,18 +187,18 @@ extension Parser { /// RegexNode -> '' | Alternation /// Alternation -> Concatenation ('|' Concatenation)* /// - mutating func parseNode() throws -> AST.Node { - let _start = source.currentPosition + mutating func parseNode() -> AST.Node { + let _start = src.currentPosition - if source.isEmpty { return .empty(.init(loc(_start))) } + if src.isEmpty { return .empty(.init(loc(_start))) } - var result = [try parseConcatenation()] + var result = [parseConcatenation()] var pipes: [SourceLocation] = [] while true { - let pipeStart = source.currentPosition - guard source.tryEat("|") else { break } + let pipeStart = src.currentPosition + guard tryEat("|") else { break } pipes.append(loc(pipeStart)) - result.append(try parseConcatenation()) + result.append(parseConcatenation()) } if result.count == 1 { @@ -199,40 +214,39 @@ extension Parser { /// ConcatComponent -> Trivia | Quote | Quantification /// Quantification -> QuantOperand Quantifier? /// - mutating func parseConcatenation() throws -> AST.Node { + mutating func parseConcatenation() -> AST.Node { var result = [AST.Node]() - let _start = source.currentPosition + let _start = src.currentPosition while true { // Check for termination, e.g. 
of recursion or bin ops - if source.isEmpty { break } - if source.peek() == "|" || source.peek() == ")" { break } + if src.isEmpty { break } + if peek() == "|" || peek() == ")" { break } // TODO: refactor loop body into function - let _start = source.currentPosition + let _start = src.currentPosition // Trivia -> `lexTrivia` - if let triv = try source.lexTrivia(context: context) { + if let triv = lexTrivia() { result.append(.trivia(triv)) continue } // Quote -> `lexQuote` - if let quote = try source.lexQuote(context: context) { + if let quote = lexQuote() { result.append(.quote(quote)) continue } // Interpolation -> `lexInterpolation` - if let interpolation = try source.lexInterpolation() { + if let interpolation = lexInterpolation() { result.append(.interpolation(interpolation)) continue } // Quantification -> QuantOperand Quantifier? - if let operand = try parseQuantifierOperand() { - if let (amt, kind, trivia) = - try source.lexQuantifier(context: context) { + if let operand = parseQuantifierOperand() { + if let (amt, kind, trivia) = lexQuantifier() { let location = loc(_start) result.append(.quantification( .init(amt, kind, operand, location, trivia: trivia))) @@ -242,7 +256,8 @@ extension Parser { continue } - throw Unreachable("TODO: reason") + unreachable("Should have parsed at least an atom") + break } guard !result.isEmpty else { return .empty(.init(loc(_start))) @@ -257,30 +272,30 @@ extension Parser { /// Perform a recursive parse for the branches of a conditional. mutating func parseConditionalBranches( start: Source.Position, _ cond: AST.Conditional.Condition - ) throws -> AST.Node { - let child = try parseNode() + ) -> AST.Node { + let child = parseNode() let trueBranch: AST.Node, falseBranch: AST.Node, pipe: SourceLocation? switch child { case .alternation(let a): + pipe = a.pipes[0] + trueBranch = a.children[0] + falseBranch = a.children[1] + // If we have an alternation child, we only accept 2 branches. 
let numBranches = a.children.count guard numBranches == 2 else { - // TODO: Better API for the parser to throw located errors. - throw Source.LocatedError( - ParseError.tooManyBranchesInConditional(numBranches), child.location - ) + diags.error(.tooManyBranchesInConditional(numBranches), + at: child.location) + break } - trueBranch = a.children[0] - falseBranch = a.children[1] - pipe = a.pipes[0] default: // If there's no alternation, the child is assumed to be the true // branch, with the false branch matching anything. trueBranch = child - falseBranch = .empty(.init(loc(source.currentPosition))) + falseBranch = .empty(.init(loc(src.currentPosition))) pipe = nil } - try source.expect(")") + expect(")") return .conditional(.init( cond, trueBranch: trueBranch, pipe: pipe, falseBranch: falseBranch, loc(start))) @@ -290,7 +305,7 @@ extension Parser { /// current set of options. private mutating func applySyntaxOptions( of opts: AST.MatchingOptionSequence, isScoped: Bool - ) throws { + ) { func mapOption(_ option: SyntaxOptions, _ pred: (AST.MatchingOption) -> Bool) { if opts.resetsCurrentOptions { @@ -323,12 +338,9 @@ extension Parser { // An unscoped removal of extended syntax is not allowed in a multi-line // literal. if let opt = opts.removing.first(where: \.isAnyExtended) { - throw Source.LocatedError( - ParseError.cannotRemoveExtendedSyntaxInMultilineMode, opt.location) - } - if opts.resetsCurrentOptions { - throw Source.LocatedError( - ParseError.cannotResetExtendedSyntaxInMultilineMode, opts.caretLoc!) + error(.cannotRemoveExtendedSyntaxInMultilineMode, at: opt.location) + } else if opts.resetsCurrentOptions { + error(.cannotResetExtendedSyntaxInMultilineMode, at: opts.caretLoc!) } // The only remaning case is an unscoped addition of extended syntax, // which is a no-op. @@ -343,36 +355,35 @@ extension Parser { /// current set of options. 
private mutating func applySyntaxOptions( of group: AST.Group.Kind, isScoped: Bool - ) throws { + ) { if case .changeMatchingOptions(let seq) = group { - try applySyntaxOptions(of: seq, isScoped: isScoped) + applySyntaxOptions(of: seq, isScoped: isScoped) } } /// Perform a recursive parse for the body of a group. mutating func parseGroupBody( start: Source.Position, _ kind: AST.Located - ) throws -> AST.Group { + ) -> AST.Group { context.recordGroup(kind.value) let currentSyntax = context.syntax - try applySyntaxOptions(of: kind.value, isScoped: true) + applySyntaxOptions(of: kind.value, isScoped: true) defer { context.syntax = currentSyntax } let unsetsExtendedSyntax = currentSyntax.contains(.extendedSyntax) && !context.syntax.contains(.extendedSyntax) - let child = try parseNode() - try source.expect(")") + let child = parseNode() + expect(")") let groupLoc = loc(start) // In multi-line literals, the body of a group that unsets extended syntax // may not span multiple lines. if unsetsExtendedSyntax && context.syntax.contains(.multilineCompilerLiteral) && - source[child.location.range].spansMultipleLinesInRegexLiteral { - throw Source.LocatedError( - ParseError.unsetExtendedSyntaxMayNotSpanMultipleLines, groupLoc) + src[child.location.range].spansMultipleLinesInRegexLiteral { + error(.unsetExtendedSyntaxMayNotSpanMultipleLines, at: groupLoc) } return .init(kind, child, groupLoc) } @@ -386,7 +397,7 @@ extension Parser { /// mutating func parseAbsentFunctionBody( _ start: AST.Located - ) throws -> AST.AbsentFunction { + ) -> AST.AbsentFunction { let startLoc = start.location // TODO: Diagnose on nested absent functions, which Oniguruma states is @@ -395,31 +406,31 @@ extension Parser { switch start.value { case .withoutPipe: // Must be a repeater. 
- kind = .repeater(try parseNode()) - case .withPipe where source.peek() == ")": + kind = .repeater(parseNode()) + case .withPipe where peek() == ")": kind = .clearer case .withPipe: // Can either be an expression or stopper depending on whether we have a // any additional '|'s. - let child = try parseNode() + let child = parseNode() switch child { case .alternation(let alt): // A pipe, so an expression. + kind = .expression( + absentee: alt.children[0], pipe: alt.pipes[0], expr: alt.children[1]) + let numChildren = alt.children.count guard numChildren == 2 else { - throw Source.LocatedError( - ParseError.tooManyAbsentExpressionChildren(numChildren), - child.location - ) + error(.tooManyAbsentExpressionChildren(numChildren), + at: child.location) + break } - kind = .expression( - absentee: alt.children[0], pipe: alt.pipes[0], expr: alt.children[1]) default: // No pipes, so a stopper. kind = .stopper(child) } } - try source.expect(")") + expect(")") return .init(kind, start: startLoc, location: loc(startLoc.start)) } @@ -431,44 +442,43 @@ extension Parser { /// Conditional -> CondStart Concatenation ('|' Concatenation)? ')' /// CondStart -> KnownCondStart | GroupCondStart /// - mutating func parseQuantifierOperand() throws -> AST.Node? { - assert(!source.isEmpty) + mutating func parseQuantifierOperand() -> AST.Node? { + assert(!src.isEmpty) - let _start = source.currentPosition + let _start = src.currentPosition // Check if we have the start of a conditional '(?(cond)', which can either // be a known condition, or an arbitrary group condition. 
- if let cond = try source.lexKnownConditionalStart(context: context) { - return try parseConditionalBranches(start: _start, cond) + if let cond = lexKnownConditionalStart() { + return parseConditionalBranches(start: _start, cond) } - if let kind = try source.lexGroupConditionalStart(context: context) { + if let kind = lexGroupConditionalStart() { let groupStart = kind.location.start - let group = try parseGroupBody(start: groupStart, kind) - return try parseConditionalBranches( + let group = parseGroupBody(start: groupStart, kind) + return parseConditionalBranches( start: _start, .init(.group(group), group.location)) } // Check if we have an Oniguruma absent function. - if let start = source.lexAbsentFunctionStart() { - return .absentFunction(try parseAbsentFunctionBody(start)) + if let start = lexAbsentFunctionStart() { + return .absentFunction(parseAbsentFunctionBody(start)) } // Check if we have the start of a group '('. - if let kind = try source.lexGroupStart(context: context) { - return .group(try parseGroupBody(start: _start, kind)) + if let kind = lexGroupStart() { + return .group(parseGroupBody(start: _start, kind)) } // Check if we have the start of a custom character class '['. - if let cccStart = source.lexCustomCCStart() { - return .customCharacterClass( - try parseCustomCharacterClass(cccStart)) + if let cccStart = lexCustomCCStart() { + return .customCharacterClass(parseCustomCharacterClass(cccStart)) } - if let atom = try source.lexAtom(context: context) { + if let atom = lexAtom() { // If we have a change matching options atom, apply the syntax options. We // already take care of scoping syntax options within a group. 
if case .changeMatchingOptions(let opts) = atom.kind { - try applySyntaxOptions(of: opts, isScoped: false) + applySyntaxOptions(of: opts, isScoped: false) } // TODO: track source locations return .atom(atom) @@ -493,19 +503,18 @@ extension Parser { /// mutating func parseCustomCharacterClass( _ start: Source.Located - ) throws -> CustomCC { + ) -> CustomCC { let alreadyInCCC = context.isInCustomCharacterClass context.isInCustomCharacterClass = true defer { context.isInCustomCharacterClass = alreadyInCCC } typealias Member = CustomCC.Member var members: Array = [] - try parseCCCMembers(into: &members) + parseCCCMembers(into: &members) // Make sure we have at least one semantic member. if members.none(\.isSemantic) { - throw Source.LocatedError( - ParseError.expectedCustomCharacterClassMembers, start.location) + error(.expectedCustomCharacterClassMembers, at: start.location) } // If we have a binary set operator, parse it and the next members. Note @@ -513,40 +522,39 @@ extension Parser { // TODO: We may want to diagnose and require users to disambiguate, at least // for chains of separate operators. // TODO: What about precedence? - while let binOp = try source.lexCustomCCBinOp() { + while let binOp = lexCustomCCBinOp() { var rhs: Array = [] - try parseCCCMembers(into: &rhs) + parseCCCMembers(into: &rhs) if rhs.none(\.isSemantic) { - throw Source.LocatedError( - ParseError.expectedCustomCharacterClassMembers, start.location) + error(.expectedCustomCharacterClassMembers, at: start.location) } members = [.setOperation(members, binOp, rhs)] } - try source.expect("]") + expect("]") return CustomCC(start, members, loc(start.location.start)) } - mutating func parseCCCMember() throws -> CustomCC.Member? { - guard !source.isEmpty && source.peek() != "]" && source.peekCCBinOp() == nil + mutating func parseCCCMember() -> CustomCC.Member? { + guard !src.isEmpty && peek() != "]" && peekCCBinOp() == nil else { return nil } // Nested custom character class. 
- if let cccStart = source.lexCustomCCStart() { - return .custom(try parseCustomCharacterClass(cccStart)) + if let cccStart = lexCustomCCStart() { + return .custom(parseCustomCharacterClass(cccStart)) } // Quoted sequence. - if let quote = try source.lexQuote(context: context) { + if let quote = lexQuote() { return .quote(quote) } // Lex triva if we're allowed. - if let trivia = try source.lexTrivia(context: context) { + if let trivia = lexTrivia() { return .trivia(trivia) } - if let atom = try source.lexAtom(context: context) { + if let atom = lexAtom() { return .atom(atom) } return nil @@ -554,9 +562,7 @@ extension Parser { /// Attempt to parse a custom character class range into `members`, or regular /// members if a range cannot be formed. - mutating func parsePotentialCCRange( - into members: inout [CustomCC.Member] - ) throws { + mutating func parsePotentialCCRange(into members: inout [CustomCC.Member]) { guard let lhs = members.last, lhs.isSemantic else { return } // Try and see if we can parse a character class range. Each time we parse @@ -564,23 +570,21 @@ extension Parser { // being a range, and we bail. If we succeed in parsing, we remove the // intermediate members. let membersBeforeRange = members.count - 1 - while let t = try source.lexTrivia(context: context) { + while let t = lexTrivia() { members.append(.trivia(t)) } - guard let dash = source.lexCustomCharacterClassRangeOperator() else { - return - } + guard let dash = lexCustomCharacterClassRangeOperator() else { return } // If we can't parse a range, '-' becomes literal, e.g `[6-]`. 
members.append(.atom(.init(.char("-"), dash))) - while let t = try source.lexTrivia(context: context) { + while let t = lexTrivia() { members.append(.trivia(t)) } - guard let rhs = try parseCCCMember() else { return } + guard let rhs = parseCCCMember() else { return } members.append(rhs) - func makeOperand(_ m: CustomCC.Member, isLHS: Bool) throws -> AST.Atom { + func makeOperand(_ m: CustomCC.Member, isLHS: Bool) -> AST.Atom? { switch m { case .atom(let a): return a @@ -588,25 +592,23 @@ extension Parser { // Not supported. While .NET allows `x-[...]` to spell subtraction, we // require `x--[...]`. We also ban `[...]-x` for consistency. if isLHS { - throw Source.LocatedError( - ParseError.invalidCharacterClassRangeOperand, m.location) + error(.invalidCharacterClassRangeOperand, at: m.location) } else { - throw Source.LocatedError( - ParseError.unsupportedDotNetSubtraction, m.location) + error(.unsupportedDotNetSubtraction, at: m.location) } case .quote: // Currently unsupported, we need to figure out what the semantics // would be for grapheme/scalar modes. - throw Source.LocatedError( - ParseError.unsupported("range with quoted sequence"), m.location) + error(.unsupported("range with quoted sequence"), at: m.location) case .trivia: - throw Unreachable("Should have been lexed separately") + unreachable("Should have been lexed separately") case .range, .setOperation: - throw Unreachable("Parsed later") + unreachable("Parsed later") } + return nil } - let lhsOp = try makeOperand(lhs, isLHS: true) - let rhsOp = try makeOperand(rhs, isLHS: false) + guard let lhsOp = makeOperand(lhs, isLHS: true), + let rhsOp = makeOperand(rhs, isLHS: false) else { return } // We've successfully parsed an atom LHS and RHS, so form a range, // collecting the trivia we've parsed, and replacing the members that @@ -619,49 +621,35 @@ extension Parser { // We need to specially check if we can lex a .NET character class // subtraction here as e.g `[a-c-[...]]` is allowed in .NET. 
Otherwise we'd // treat the second `-` as literal. - if let dashLoc = source.canLexDotNetCharClassSubtraction(context: context) { - throw Source.LocatedError( - ParseError.unsupportedDotNetSubtraction, dashLoc) + if let dashLoc = canLexDotNetCharClassSubtraction() { + error(.unsupportedDotNetSubtraction, at: dashLoc) } } - mutating func parseCCCMembers( - into members: inout Array - ) throws { + mutating func parseCCCMembers(into members: inout Array) { // Parse members and ranges until we see the end of the custom char class // or an operator. - while let member = try parseCCCMember() { + while let member = parseCCCMember() { members.append(member) - try parsePotentialCCRange(into: &members) + parsePotentialCCRange(into: &members) } } } -public enum ASTStage { - /// The regex is parsed, and a syntactically valid AST is returned. Otherwise - /// an error is thrown. This is useful for e.g syntax coloring. - case syntactic - - /// The regex is parsed, and a syntactically and semantically valid AST is - /// returned. Otherwise an error is thrown. A semantically valid AST has been - /// checked for e.g unsupported constructs and invalid backreferences. 
- case semantic +public func parseWithRecovery( + _ regex: S, _ syntax: SyntaxOptions +) -> AST where S.SubSequence == Substring +{ + let source = Source(String(regex)) + var parser = Parser(source, syntax: syntax) + return validate(parser.parse()) } public func parse( - _ regex: S, _ stage: ASTStage, _ syntax: SyntaxOptions + _ regex: S, _ syntax: SyntaxOptions ) throws -> AST where S.SubSequence == Substring { - let source = Source(String(regex)) - var parser = Parser(source, syntax: syntax) - let ast = try parser.parse() - switch stage { - case .syntactic: - break - case .semantic: - try validate(ast) - } - return ast + try parseWithRecovery(regex, syntax).ensureValid() } extension StringProtocol { @@ -691,15 +679,25 @@ fileprivate func defaultSyntaxOptions( } } +/// Parses a given regex string with delimiters, inferring the syntax options +/// from the delimiters used. +public func parseWithDelimitersWithRecovery( + _ regex: S +) -> AST where S.SubSequence == Substring { + let (contents, delim) = droppingRegexDelimiters(String(regex)) + let syntax = defaultSyntaxOptions(delim, contents: contents) + return parseWithRecovery(contents, syntax) +} + /// Parses a given regex string with delimiters, inferring the syntax options /// from the delimiters used. public func parseWithDelimiters( - _ regex: S, _ stage: ASTStage + _ regex: S ) throws -> AST where S.SubSequence == Substring { let (contents, delim) = droppingRegexDelimiters(String(regex)) + let syntax = defaultSyntaxOptions(delim, contents: contents) do { - let syntax = defaultSyntaxOptions(delim, contents: contents) - return try parse(contents, stage, syntax) + return try parseWithRecovery(contents, syntax).ensureValid() } catch let error as LocatedErrorProtocol { // Convert the range in 'contents' to the range in 'regex'. 
let delimCount = delim.opening.count diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index c49436702..1b9da3e50 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -14,14 +14,18 @@ fileprivate struct RegexValidator { let ast: AST let captures: CaptureList + var diags = Diagnostics() init(_ ast: AST) { self.ast = ast self.captures = ast.captureList } - func error(_ kind: ParseError, at loc: SourceLocation) -> Error { - Source.LocatedError(kind, loc) + mutating func error(_ kind: ParseError, at loc: SourceLocation) { + diags.error(kind, at: loc) + } + mutating func unreachable(_ str: String, at loc: SourceLocation) { + diags.fatal(.unreachable(str), at: loc) } } @@ -30,88 +34,112 @@ extension String { } extension RegexValidator { - func validate() throws { + mutating func validate() -> AST { for opt in ast.globalOptions?.options ?? [] { - try validateGlobalMatchingOption(opt) + validateGlobalMatchingOption(opt) } - try validateCaptures() - try validateNode(ast.root) + validateCaptures() + validateNode(ast.root) + + var result = ast + result.diags.append(contentsOf: diags) + return result } - func validateGlobalMatchingOption(_ opt: AST.GlobalMatchingOption) throws { + /// Called when some piece of invalid AST is encountered. We want to ensure + /// an error was emitted. + mutating func expectInvalid(at loc: SourceLocation) { + guard ast.diags.hasAnyError else { + unreachable("Invalid, but no error emitted?", at: loc) + return + } + } + + mutating func validateGlobalMatchingOption(_ opt: AST.GlobalMatchingOption) { switch opt.kind { case .limitDepth, .limitHeap, .limitMatch, .notEmpty, .notEmptyAtStart, .noAutoPossess, .noDotStarAnchor, .noJIT, .noStartOpt, .utfMode, .unicodeProperties: // These are PCRE specific, and not something we're likely to ever // support. 
- throw error(.unsupported("global matching option"), at: opt.location) + error(.unsupported("global matching option"), at: opt.location) case .newlineMatching: // We have implemented the correct behavior for multi-line literals, but // these should also affect '.' and '\N' matching, which we haven't // implemented. - throw error(.unsupported("newline matching mode"), at: opt.location) + error(.unsupported("newline matching mode"), at: opt.location) case .newlineSequenceMatching: // We haven't yet implemented the '\R' matching specifics of these. - throw error( - .unsupported("newline sequence matching mode"), at: opt.location) + error(.unsupported("newline sequence matching mode"), at: opt.location) } } - func validateCaptures() throws { + mutating func validateCaptures() { // TODO: Should this be validated when creating the capture list? var usedNames = Set() for capture in captures.captures { guard let name = capture.name else { continue } - guard usedNames.insert(name).inserted else { - throw error(.duplicateNamedCapture(name), at: capture.location) + if !usedNames.insert(name).inserted { + error(.duplicateNamedCapture(name), at: capture.location) } } } - func validateReference(_ ref: AST.Reference) throws { + mutating func validateReference(_ ref: AST.Reference) { if let recLevel = ref.recursionLevel { - throw error(.unsupported("recursion level"), at: recLevel.location) + error(.unsupported("recursion level"), at: recLevel.location) } switch ref.kind { - case .absolute(let i): - guard i < captures.captures.count else { - throw error(.invalidReference(i), at: ref.innerLoc) + case .absolute(let num): + guard let i = num.value else { + // Should have already been diagnosed. 
+ expectInvalid(at: ref.innerLoc) + break + } + if i >= captures.captures.count { + error(.invalidReference(i), at: ref.innerLoc) } case .named(let name): - guard captures.hasCapture(named: name) else { - throw error(.invalidNamedReference(name), at: ref.innerLoc) + // An empty name is already invalid, so don't bother validating. + guard !name.isEmpty else { break } + if !captures.hasCapture(named: name) { + error(.invalidNamedReference(name), at: ref.innerLoc) + } + case .relative(let num): + guard let _ = num.value else { + // Should have already been diagnosed. + expectInvalid(at: ref.innerLoc) + break } - case .relative: - throw error(.unsupported("relative capture reference"), at: ref.innerLoc) + error(.unsupported("relative capture reference"), at: ref.innerLoc) } } - func validateMatchingOption(_ opt: AST.MatchingOption) throws { + mutating func validateMatchingOption(_ opt: AST.MatchingOption) { let loc = opt.location switch opt.kind { case .allowDuplicateGroupNames: // Not currently supported as we need to figure out what to do with // the capture type. 
- throw error(.unsupported("duplicate group naming"), at: loc) + error(.unsupported("duplicate group naming"), at: loc) case .unicodeWordBoundaries: - throw error(.unsupported("unicode word boundary mode"), at: loc) + error(.unsupported("unicode word boundary mode"), at: loc) case .textSegmentWordMode, .textSegmentGraphemeMode: - throw error(.unsupported("text segment mode"), at: loc) + error(.unsupported("text segment mode"), at: loc) case .byteSemantics: - throw error(.unsupported("byte semantic mode"), at: loc) + error(.unsupported("byte semantic mode"), at: loc) case .unicodeScalarSemantics: - throw error(.unsupported("unicode scalar semantic mode"), at: loc) - + error(.unsupported("unicode scalar semantic mode"), at: loc) + case .graphemeClusterSemantics: - throw error(.unsupported("grapheme semantic mode"), at: loc) - + error(.unsupported("grapheme semantic mode"), at: loc) + case .caseInsensitive, .possessiveByDefault, .reluctantByDefault, .singleLine, .multiline, .namedCapturesOnly, .extended, .extraExtended, .asciiOnlyDigit, .asciiOnlyWord, .asciiOnlySpace, .asciiOnlyPOSIXProps: @@ -119,18 +147,18 @@ extension RegexValidator { } } - func validateMatchingOptions(_ opts: AST.MatchingOptionSequence) throws { + mutating func validateMatchingOptions(_ opts: AST.MatchingOptionSequence) { for opt in opts.adding { - try validateMatchingOption(opt) + validateMatchingOption(opt) } for opt in opts.removing { - try validateMatchingOption(opt) + validateMatchingOption(opt) } } - func validateBinaryProperty( + mutating func validateBinaryProperty( _ prop: Unicode.BinaryProperty, at loc: SourceLocation - ) throws { + ) { switch prop { case .asciiHexDigit, .alphabetic, .bidiControl, .bidiMirrored, .cased, .caseIgnorable, .changesWhenCasefolded, .changesWhenCasemapped, @@ -153,46 +181,49 @@ extension RegexValidator { break case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD, .expandsOnNFKC: - throw error(.deprecatedUnicode(prop.rawValue.quoted), at: loc) + 
error(.deprecatedUnicode(prop.rawValue.quoted), at: loc) case .compositionExclusion, .emojiComponent, .extendedPictographic, .graphemeLink, .hyphen, .otherAlphabetic, .otherDefaultIgnorableCodePoint, .otherGraphemeExtended, .otherIDContinue, .otherIDStart, .otherLowercase, .otherMath, .otherUppercase, .prependedConcatenationMark: - throw error(.unsupported(prop.rawValue.quoted), at: loc) + error(.unsupported(prop.rawValue.quoted), at: loc) } } - func validateCharacterProperty( + mutating func validateCharacterProperty( _ prop: AST.Atom.CharacterProperty, at loc: SourceLocation - ) throws { + ) { // TODO: We could re-add the .other case to diagnose unknown properties // here instead of in the parser. // TODO: Should we store an 'inner location' for the contents of `\p{...}`? switch prop.kind { case .binary(let b, _): - try validateBinaryProperty(b, at: loc) + validateBinaryProperty(b, at: loc) case .any, .assigned, .ascii, .generalCategory, .posix, .named, .script, .scriptExtension, .age, .numericType, .numericValue, .mapping, .ccc: break + case .invalid: + // Should have already been diagnosed. + expectInvalid(at: loc) case .pcreSpecial: - throw error(.unsupported("PCRE property"), at: loc) + error(.unsupported("PCRE property"), at: loc) case .block: - throw error(.unsupported("Unicode block property"), at: loc) + error(.unsupported("Unicode block property"), at: loc) case .javaSpecial: - throw error(.unsupported("Java property"), at: loc) + error(.unsupported("Java property"), at: loc) } } - func validateEscaped( + mutating func validateEscaped( _ esc: AST.Atom.EscapedBuiltin, at loc: SourceLocation - ) throws { + ) { switch esc { case .resetStartOfMatch, .singleDataUnit, // '\N' needs to be emitted using 'emitAny'. .notNewline: - throw error(.unsupported("'\\\(esc.character)'"), at: loc) + error(.unsupported("'\\\(esc.character)'"), at: loc) // Character classes. 
case .decimalDigit, .notDecimalDigit, .whitespace, .notWhitespace, @@ -217,34 +248,34 @@ extension RegexValidator { } } - func validateAtom(_ atom: AST.Atom, inCustomCharacterClass: Bool) throws { + mutating func validateAtom(_ atom: AST.Atom, inCustomCharacterClass: Bool) { switch atom.kind { case .escaped(let esc): - try validateEscaped(esc, at: atom.location) + validateEscaped(esc, at: atom.location) case .keyboardControl, .keyboardMeta, .keyboardMetaControl: // We need to implement the scalar computations for these. - throw error(.unsupported("control sequence"), at: atom.location) + error(.unsupported("control sequence"), at: atom.location) case .property(let p): - try validateCharacterProperty(p, at: atom.location) + validateCharacterProperty(p, at: atom.location) case .backreference(let r): - try validateReference(r) + validateReference(r) case .subpattern: - throw error(.unsupported("subpattern"), at: atom.location) + error(.unsupported("subpattern"), at: atom.location) case .callout: // These are PCRE and Oniguruma specific, supporting them is future work. - throw error(.unsupported("callout"), at: atom.location) + error(.unsupported("callout"), at: atom.location) case .backtrackingDirective: // These are PCRE-specific, and are unlikely to be fully supported. - throw error(.unsupported("backtracking directive"), at: atom.location) + error(.unsupported("backtracking directive"), at: atom.location) case .changeMatchingOptions(let opts): - try validateMatchingOptions(opts) + validateMatchingOptions(opts) case .namedCharacter: // TODO: We should error on unknown Unicode scalar names. @@ -253,77 +284,89 @@ extension RegexValidator { case .scalarSequence: // Not currently supported in a custom character class. 
if inCustomCharacterClass { - throw error(.unsupported("scalar sequence in custom character class"), - at: atom.location) + error(.unsupported("scalar sequence in custom character class"), + at: atom.location) } case .char, .scalar, .startOfLine, .endOfLine, .any: break + + case .invalid: + // Should have already been diagnosed. + expectInvalid(at: atom.location) + break } } - func validateCustomCharacterClass(_ c: AST.CustomCharacterClass) throws { + mutating func validateCustomCharacterClass(_ c: AST.CustomCharacterClass) { for member in c.members { - try validateCharacterClassMember(member) + validateCharacterClassMember(member) } } - func validateCharacterClassRange( + mutating func validateCharacterClassRange( _ range: AST.CustomCharacterClass.Range - ) throws { + ) { let lhs = range.lhs let rhs = range.rhs - try validateAtom(lhs, inCustomCharacterClass: true) - try validateAtom(rhs, inCustomCharacterClass: true) + validateAtom(lhs, inCustomCharacterClass: true) + validateAtom(rhs, inCustomCharacterClass: true) guard lhs.isValidCharacterClassRangeBound else { - throw error(.invalidCharacterClassRangeOperand, at: lhs.location) + error(.invalidCharacterClassRangeOperand, at: lhs.location) + return } guard rhs.isValidCharacterClassRangeBound else { - throw error(.invalidCharacterClassRangeOperand, at: rhs.location) + error(.invalidCharacterClassRangeOperand, at: rhs.location) + return } guard let lhsChar = lhs.literalCharacterValue else { - throw error( + error( .unsupported("character class range operand"), at: lhs.location) + return } guard let rhsChar = rhs.literalCharacterValue else { - throw error( + error( .unsupported("character class range operand"), at: rhs.location) + return } - guard lhsChar <= rhsChar else { - throw error( + if lhsChar > rhsChar { + error( .invalidCharacterRange(from: lhsChar, to: rhsChar), at: range.dashLoc) } } - func validateCharacterClassMember( + mutating func validateCharacterClassMember( _ member: 
AST.CustomCharacterClass.Member - ) throws { + ) { switch member { case .custom(let c): - try validateCustomCharacterClass(c) + validateCustomCharacterClass(c) case .range(let r): - try validateCharacterClassRange(r) + validateCharacterClassRange(r) case .atom(let a): - try validateAtom(a, inCustomCharacterClass: true) + validateAtom(a, inCustomCharacterClass: true) case .setOperation(let lhs, _, let rhs): - for lh in lhs { try validateCharacterClassMember(lh) } - for rh in rhs { try validateCharacterClassMember(rh) } + for lh in lhs { validateCharacterClassMember(lh) } + for rh in rhs { validateCharacterClassMember(rh) } case .quote, .trivia: break } } - func validateGroup(_ group: AST.Group) throws { + mutating func validateGroup(_ group: AST.Group) { let kind = group.kind + if let name = kind.value.name, name.isEmpty { + expectInvalid(at: kind.location) + } switch kind.value { case .capture, .namedCapture, .nonCapture, .lookahead, .negativeLookahead, .atomicNonCapturing: @@ -331,79 +374,83 @@ extension RegexValidator { case .balancedCapture: // These are .NET specific, and kinda niche. - throw error(.unsupported("balanced capture"), at: kind.location) + error(.unsupported("balanced capture"), at: kind.location) case .nonCaptureReset: // We need to figure out how these interact with typed captures. 
- throw error(.unsupported("branch reset group"), at: kind.location) + error(.unsupported("branch reset group"), at: kind.location) case .nonAtomicLookahead: - throw error(.unsupported("non-atomic lookahead"), at: kind.location) + error(.unsupported("non-atomic lookahead"), at: kind.location) case .lookbehind, .negativeLookbehind, .nonAtomicLookbehind: - throw error(.unsupported("lookbehind"), at: kind.location) + error(.unsupported("lookbehind"), at: kind.location) case .scriptRun, .atomicScriptRun: - throw error(.unsupported("script run"), at: kind.location) + error(.unsupported("script run"), at: kind.location) case .changeMatchingOptions(let opts): - try validateMatchingOptions(opts) + validateMatchingOptions(opts) } - try validateNode(group.child) + validateNode(group.child) } - func validateQuantification(_ quant: AST.Quantification) throws { - try validateNode(quant.child) - guard quant.child.isQuantifiable else { - throw error(.notQuantifiable, at: quant.child.location) + mutating func validateQuantification(_ quant: AST.Quantification) { + validateNode(quant.child) + if !quant.child.isQuantifiable { + error(.notQuantifiable, at: quant.child.location) } switch quant.amount.value { case .range(let lhs, let rhs): - guard lhs.value <= rhs.value else { - throw error( - .invalidQuantifierRange(lhs.value, rhs.value), at: quant.location) + guard let lhs = lhs.value, let rhs = rhs.value else { + // Should have already been diagnosed. 
+ expectInvalid(at: quant.location) + break + } + if lhs > rhs { + error(.invalidQuantifierRange(lhs, rhs), at: quant.location) } case .zeroOrMore, .oneOrMore, .zeroOrOne, .exactly, .nOrMore, .upToN: break } } - func validateNode(_ node: AST.Node) throws { + mutating func validateNode(_ node: AST.Node) { switch node { case .alternation(let a): for branch in a.children { - try validateNode(branch) + validateNode(branch) } case .concatenation(let c): for child in c.children { - try validateNode(child) + validateNode(child) } case .group(let g): - try validateGroup(g) + validateGroup(g) case .conditional(let c): // Note even once we get runtime support for this, we need to change the // parsing to incorporate what is specified in the syntax proposal. - throw error(.unsupported("conditional"), at: c.location) + error(.unsupported("conditional"), at: c.location) case .quantification(let q): - try validateQuantification(q) + validateQuantification(q) case .atom(let a): - try validateAtom(a, inCustomCharacterClass: false) + validateAtom(a, inCustomCharacterClass: false) case .customCharacterClass(let c): - try validateCustomCharacterClass(c) + validateCustomCharacterClass(c) case .absentFunction(let a): // These are Oniguruma specific. - throw error(.unsupported("absent function"), at: a.location) + error(.unsupported("absent function"), at: a.location) case .interpolation(let i): // This is currently rejected in the parser for better diagnostics, but // reject here too until we get runtime support. - throw error(.unsupported("interpolation"), at: i.location) + error(.unsupported("interpolation"), at: i.location) case .quote, .trivia, .empty: break @@ -412,6 +459,7 @@ extension RegexValidator { } /// Check a regex AST for semantic validity. 
-public func validate(_ ast: AST) throws { - try RegexValidator(ast).validate() +public func validate(_ ast: AST) -> AST { + var validator = RegexValidator(ast) + return validator.validate() } diff --git a/Sources/_RegexParser/Regex/Parse/Source.swift b/Sources/_RegexParser/Regex/Parse/Source.swift index 23cc0497d..22715ebc3 100644 --- a/Sources/_RegexParser/Regex/Parse/Source.swift +++ b/Sources/_RegexParser/Regex/Parse/Source.swift @@ -52,80 +52,20 @@ extension Source { func peek() -> Char? { _slice.first } - mutating func advance() { - assert(!isEmpty) - let newLower = _slice.index(after: bounds.lowerBound) - self.bounds = newLower ..< bounds.upperBound - } - - mutating func advance(_ i: Int) { - for _ in 0.. Bool { + guard n > 0, let newLower = _slice.index( + bounds.lowerBound, offsetBy: n, limitedBy: bounds.upperBound) + else { + return false } - } - - mutating func tryEat(_ c: Char) -> Bool { - guard peek() == c else { return false } - advance() - return true - } - - mutating func tryEat(where pred: (Char) throws -> Bool) rethrows -> Bool { - guard let next = peek(), try pred(next) else { return false } - advance() - return true - } - - mutating func tryEat(sequence c: C) -> Bool - where C.Element == Char { - guard _slice.starts(with: c) else { return false } - advance(c.count) + self.bounds = newLower ..< bounds.upperBound return true } - mutating func tryEat(anyOf set: C) -> Char? - where C.Element == Char - { - guard let c = peek(), set.contains(c) else { return nil } - advance() - return c - } - mutating func tryEat(anyOf set: Char...) -> Char? { - tryEat(anyOf: set) - } - - /// Try to eat any character, returning `nil` if the input has been exhausted. - mutating func tryEat() -> Char? 
{ - guard !isEmpty else { return nil } - return eat() - } - - mutating func eat(asserting c: Char) { - assert(peek() == c) - advance() - } - - mutating func eat() -> Char { - assert(!isEmpty) - defer { advance() } - return peek().unsafelyUnwrapped - } - - func starts( - with s: S - ) -> Bool where S.Element == Char { - _slice.starts(with: s) - } - - mutating func eat(upTo: Position) -> Input.SubSequence { - defer { - while _slice.startIndex != upTo { advance() } - } - return _slice[.. Input.SubSequence { let pre = _slice.prefix(count) - defer { advance(pre.count) } + tryAdvance(pre.count) return pre } @@ -134,10 +74,20 @@ extension Source { _ f: (Char) -> Bool ) -> Input.SubSequence? { guard let pre = peekPrefix(maxLength: maxLength, f) else { return nil } - defer { self.advance(pre.count) } + tryAdvance(pre.count) return pre } + mutating func tryEat(count: Int) -> Input.SubSequence? { + let pre = _slice.prefix(count) + guard tryAdvance(count) else { return nil } + return pre + } + + func starts(with s: S) -> Bool where S.Element == Char { + _slice.starts(with: s) + } + func peekPrefix( maxLength: Int? = nil, _ f: (Char) -> Bool @@ -153,11 +103,4 @@ extension Source { return pre } - - mutating func tryEat(count: Int) -> Input.SubSequence? { - let pre = _slice.prefix(count) - guard pre.count == count else { return nil } - defer { advance(count) } - return pre - } } diff --git a/Sources/_RegexParser/Regex/Parse/SourceLocation.swift b/Sources/_RegexParser/Regex/Parse/SourceLocation.swift index eb51643bd..6f6928d2f 100644 --- a/Sources/_RegexParser/Regex/Parse/SourceLocation.swift +++ b/Sources/_RegexParser/Regex/Parse/SourceLocation.swift @@ -126,3 +126,13 @@ extension Source.LocatedError: CustomStringConvertible { return error } } + +extension Error { + func addingLocation(_ loc: Range) -> Error { + // If we're already a LocatedError, don't change the location. 
+ if self is LocatedErrorProtocol { + return self + } + return Source.LocatedError(self, loc) + } +} diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 10e50d712..48a2512cf 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -167,12 +167,21 @@ extension AST.Atom { case .changeMatchingOptions(let opts): return "changeMatchingOptions<\(opts)>" + case .invalid: + return "" + case .char, .scalar: fatalError("Unreachable") } } } +extension AST.Atom.Number: _ASTPrintable { + public var _dumpBase: String { + value.map { "\($0)" } ?? "" + } +} + extension AST.Atom.Callout: _ASTPrintable { public var _dumpBase: String { switch self { @@ -227,7 +236,7 @@ extension AST.Reference: _ASTPrintable { public var _dumpBase: String { var result = "\(kind)" if let recursionLevel = recursionLevel { - result += "\(recursionLevel.value)" + result += "\(recursionLevel)" } return result } @@ -270,11 +279,11 @@ extension AST.Quantification.Amount: _ASTPrintable { case .zeroOrMore: return "zeroOrMore" case .oneOrMore: return "oneOrMore" case .zeroOrOne: return "zeroOrOne" - case let .exactly(n): return "exactly<\(n.value)>" - case let .nOrMore(n): return "nOrMore<\(n.value)>" - case let .upToN(n): return "uptoN<\(n.value)>" + case let .exactly(n): return "exactly<\(n)>" + case let .nOrMore(n): return "nOrMore<\(n)>" + case let .upToN(n): return "uptoN<\(n)>" case let .range(lower, upper): - return ".range<\(lower.value)...\(upper.value)>" + return ".range<\(lower)...\(upper)>" } } } diff --git a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift index ac553a115..0e7cfb1d3 100644 --- a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift +++ b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift @@ -32,7 +32,7 @@ extension AST.Node { showDelimiters delimiters: Bool = false, 
terminateLine: Bool = false ) -> String { - AST(self, globalOptions: nil).renderAsCanonical( + AST(self, globalOptions: nil, diags: Diagnostics()).renderAsCanonical( showDelimiters: delimiters, terminateLine: terminateLine) } } @@ -217,9 +217,9 @@ extension AST.Quantification.Amount { case .zeroOrMore: return "*" case .oneOrMore: return "+" case .zeroOrOne: return "?" - case let .exactly(n): return "{\(n.value)}" - case let .nOrMore(n): return "{\(n.value),}" - case let .upToN(n): return "{,\(n.value)}" + case let .exactly(n): return "{\(n._canonicalBase)}" + case let .nOrMore(n): return "{\(n._canonicalBase),}" + case let .upToN(n): return "{,\(n._canonicalBase)}" case let .range(lower, upper): return "{\(lower),\(upper)}" } @@ -229,6 +229,12 @@ extension AST.Quantification.Kind { var _canonicalBase: String { self.rawValue } } +extension AST.Atom.Number { + var _canonicalBase: String { + value.map { "\($0)" } ?? "<#number#>" + } +} + extension AST.Atom { var _canonicalBase: String { if let anchor = self.assertionKind { @@ -305,9 +311,9 @@ extension AST.GlobalMatchingOption.NewlineSequenceMatching { extension AST.GlobalMatchingOption.Kind { var _canonicalBase: String { switch self { - case .limitDepth(let i): return "LIMIT_DEPTH=\(i.value)" - case .limitHeap(let i): return "LIMIT_HEAP=\(i.value)" - case .limitMatch(let i): return "LIMIT_MATCH=\(i.value)" + case .limitDepth(let i): return "LIMIT_DEPTH=\(i._canonicalBase)" + case .limitHeap(let i): return "LIMIT_HEAP=\(i._canonicalBase)" + case .limitMatch(let i): return "LIMIT_MATCH=\(i._canonicalBase)" case .notEmpty: return "NOTEMPTY" case .notEmptyAtStart: return "NOTEMPTY_ATSTART" case .noAutoPossess: return "NO_AUTO_POSSESS" diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index bcfc8a2c2..9165f38b1 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -76,7 +76,10 @@ fileprivate extension 
Compiler.ByteCodeGen { } switch ref.kind { - case .absolute(let i): + case .absolute(let n): + guard let i = n.value else { + throw Unreachable("Expected a value") + } builder.buildBackreference(.init(i)) case .named(let name): try builder.buildNamedReference(name) @@ -442,6 +445,9 @@ fileprivate extension Compiler.ByteCodeGen { } let (low, high) = amount.bounds + guard let low = low else { + throw Unreachable("Must have a lower bound") + } switch (low, high) { case (_, 0): // TODO: Should error out earlier, maybe DSL and parser diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 601cd52a4..b4f6a7a83 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -37,7 +37,7 @@ class Compiler { func _compileRegex( _ regex: String, _ syntax: SyntaxOptions = .traditional ) throws -> Executor { - let ast = try parse(regex, .semantic, syntax) + let ast = try parse(regex, syntax) let program = try Compiler(ast: ast).emit() return Executor(program: program) } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index a912fd136..fed21a9a8 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -227,7 +227,7 @@ extension AST.Atom { case .scalarSequence, .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl, .backreference, .subpattern, .callout, - .backtrackingDirective, .changeMatchingOptions: + .backtrackingDirective, .changeMatchingOptions, .invalid: // FIXME: implement return nil } @@ -521,6 +521,9 @@ extension AST.Atom.CharacterProperty { case .javaSpecial(let s): throw Unsupported("TODO: map Java special: \(s)") + + case .invalid: + throw Unreachable("Expected valid property") } }() diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 2fe7c6ccc..b29053e14 100644 --- 
a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -56,7 +56,7 @@ extension PrettyPrinter { mutating func printBackoff(_ node: DSLTree.Node) { precondition(node.astNode != nil, "unconverted node") printAsCanonical( - .init(node.astNode!, globalOptions: nil), + .init(node.astNode!, globalOptions: nil, diags: Diagnostics()), delimiters: true) } @@ -931,6 +931,10 @@ extension AST.Atom { case .char, .scalar, .scalarSequence: return literalStringValue! + case .invalid: + // TODO: Can we recover the original regex text from the source range? + return "<#value#>" + case let .property(p): return p._regexBase @@ -973,16 +977,22 @@ extension AST.Atom { } } +extension AST.Atom.Number { + var _patternBase: String { + value.map { "\($0)" } ?? "<#number#>" + } +} + extension AST.Quantification.Amount { var _patternBase: String { switch self { case .zeroOrMore: return "ZeroOrMore" case .oneOrMore: return "OneOrMore" case .zeroOrOne: return "Optionally" - case let .exactly(n): return "Repeat(count: \(n.value))" - case let .nOrMore(n): return "Repeat(\(n.value)...)" - case let .upToN(n): return "Repeat(...\(n.value))" - case let .range(n, m): return "Repeat(\(n.value)...\(m.value))" + case let .exactly(n): return "Repeat(count: \(n._patternBase))" + case let .nOrMore(n): return "Repeat(\(n._patternBase)...)" + case let .upToN(n): return "Repeat(...\(n._patternBase))" + case let .range(n, m): return "Repeat(\(n._patternBase)...\(m._patternBase))" } } diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index 156dd7220..2683d367a 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -143,7 +143,7 @@ extension Regex where Output == AnyRegexOutput { /// /// - Parameter pattern: The regular expression. 
public init(_ pattern: String) throws { - self.init(ast: try parse(pattern, .semantic, .traditional)) + self.init(ast: try parse(pattern, .traditional)) } } @@ -157,7 +157,7 @@ extension Regex { _ pattern: String, as: Output.Type = Output.self ) throws { - self.init(ast: try parse(pattern, .semantic, .traditional)) + self.init(ast: try parse(pattern, .traditional)) } /// Produces a regex that matches `verbatim` exactly, as though every diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 410f3d14c..2a3a8fc43 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -37,13 +37,14 @@ public struct Regex: RegexComponent { self.program = Program(ast: ast) } init(ast: AST.Node) { - self.program = Program(ast: .init(ast, globalOptions: nil)) + self.program = Program(ast: + .init(ast, globalOptions: nil, diags: Diagnostics())) } // Compiler interface. Do not change independently. @usableFromInline init(_regexString pattern: String) { - self.init(ast: try! parse(pattern, .semantic, .traditional)) + self.init(ast: try! parse(pattern, .traditional)) } // Compiler interface. Do not change independently. @@ -52,7 +53,7 @@ public struct Regex: RegexComponent { assert(version == currentRegexLiteralFormatVersion) // The version argument is passed by the compiler using the value defined // in libswiftParseRegexLiteral. - self.init(ast: try! parseWithDelimiters(pattern, .semantic)) + self.init(ast: try! parseWithDelimiters(pattern)) } public var regex: Regex { diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index f28d5a8b4..cc1fb01a0 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -301,16 +301,6 @@ extension DSLTree { } } -extension DSLTree { - var ast: AST? 
{ - guard let root = root.astNode else { - return nil - } - // TODO: Options mapping - return AST(root, globalOptions: nil) - } -} - extension DSLTree { var hasCapture: Bool { root.hasCapture @@ -685,16 +675,16 @@ extension DSLTree { .init(ast: .zeroOrOne) } public static func exactly(_ n: Int) -> Self { - .init(ast: .exactly(.init(faking: n))) + .init(ast: .exactly(.init(n, at: .fake))) } public static func nOrMore(_ n: Int) -> Self { - .init(ast: .nOrMore(.init(faking: n))) + .init(ast: .nOrMore(.init(n, at: .fake))) } public static func upToN(_ n: Int) -> Self { - .init(ast: .upToN(.init(faking: n))) + .init(ast: .upToN(.init(n, at: .fake))) } public static func range(_ lower: Int, _ upper: Int) -> Self { - .init(ast: .range(.init(faking: lower), .init(faking: upper))) + .init(ast: .range(.init(lower, at: .fake), .init(upper, at: .fake))) } } diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index 49a08430d..49f9e9b11 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -48,7 +48,7 @@ func empty() -> AST.Node { } func ast(_ root: AST.Node, opts: [AST.GlobalMatchingOption.Kind]) -> AST { - .init(root, globalOptions: .init(opts.map { .init($0, .fake) })) + .init(root, globalOptions: .init(opts.map { .init($0, .fake) }), diags: Diagnostics()) } func ast(_ root: AST.Node, opts: AST.GlobalMatchingOption.Kind...) -> AST { @@ -154,20 +154,39 @@ func unsetMatchingOptions( unsetMatchingOptions(adding: adding) } -func ref(_ i: Int, recursionLevel: Int? = nil) -> AST.Reference { - .init(.absolute(i), recursionLevel: recursionLevel.map { .init(faking: $0) }, - innerLoc: .fake) +func ref(_ n: Int?) -> AST.Reference.Kind { + .absolute(.init(n, at: .fake)) } -func ref(plus n: Int, recursionLevel: Int? 
= nil) -> AST.Reference { - .init(.relative(n), recursionLevel: recursionLevel.map { .init(faking: $0) }, - innerLoc: .fake) +func ref(plus n: Int?) -> AST.Reference.Kind { + .relative(.init(n, at: .fake)) } -func ref(minus n: Int, recursionLevel: Int? = nil) -> AST.Reference { - .init(.relative(-n), recursionLevel: recursionLevel.map { .init(faking: $0) }, - innerLoc: .fake) +func ref(minus n: Int?) -> AST.Reference.Kind { + .relative(.init(n.map { x in -x }, at: .fake)) +} +func ref(named n: String) -> AST.Reference.Kind { + .named(n) +} + +func ref(_ n: Int?, recursionLevel: Int? = nil) -> AST.Reference { + .init( + ref(n), recursionLevel: recursionLevel.map { .init($0, at: .fake) }, + innerLoc: .fake + ) +} +func ref(plus n: Int?, recursionLevel: Int? = nil) -> AST.Reference { + .init( + ref(plus: n), recursionLevel: recursionLevel.map { .init($0, at: .fake) }, + innerLoc: .fake + ) +} +func ref(minus n: Int?, recursionLevel: Int? = nil) -> AST.Reference { + .init( + ref(minus: n), recursionLevel: recursionLevel.map { .init($0, at: .fake) }, + innerLoc: .fake + ) } func ref(_ s: String, recursionLevel: Int? = nil) -> AST.Reference { - .init(.named(s), recursionLevel: recursionLevel.map { .init(faking: $0) }, + .init(.named(s), recursionLevel: recursionLevel.map { .init($0, at: .fake) }, innerLoc: .fake) } func conditional( @@ -179,10 +198,11 @@ func conditional( } func pcreVersionCheck( _ kind: AST.Conditional.Condition.PCREVersionCheck.Kind, - _ major: Int, _ minor: Int + _ major: Int?, _ minor: Int? 
) -> AST.Conditional.Condition.Kind { .pcreVersionCheck(.init( - .init(faking: kind), .init(major: major, minor: minor, .fake) + .init(faking: kind), .init(major: .init(major, at: .fake), + minor: .init(minor, at: .fake), .fake) )) } func groupCondition( @@ -191,8 +211,11 @@ func groupCondition( .group(.init(.init(faking: kind), child, .fake)) } -func pcreCallout(_ arg: AST.Atom.Callout.PCRE.Argument) -> AST.Node { - atom(.callout(.pcre(.init(.init(faking: arg))))) +func pcreCallout(number: Int?) -> AST.Node { + atom(.callout(.pcre(.init(.init(faking: .number(.init(number, at: .fake))))))) +} +func pcreCallout(string: String) -> AST.Node { + atom(.callout(.pcre(.init(.init(faking: .string(string)))))) } func absentRepeater(_ child: AST.Node) -> AST.Node { @@ -268,34 +291,34 @@ func oneOrMore( quant(.oneOrMore, kind, child) } func exactly( - _ i: Int, + _ i: Int?, _ kind: AST.Quantification.Kind = .eager, of child: AST.Node ) -> AST.Node { - quant(.exactly(.init(faking: i)), kind, child) + quant(.exactly(.init(i, at: .fake)), kind, child) } func nOrMore( - _ i: Int, + _ i: Int?, _ kind: AST.Quantification.Kind = .eager, of child: AST.Node ) -> AST.Node { - quant(.nOrMore(.init(faking: i)), kind, child) + quant(.nOrMore(.init(i, at: .fake)), kind, child) } func upToN( - _ i: Int, + _ i: Int?, _ kind: AST.Quantification.Kind = .eager, of child: AST.Node ) -> AST.Node { - quant(.upToN(.init(faking: i)), kind, child) + quant(.upToN(.init(i, at: .fake)), kind, child) } func quantRange( _ r: ClosedRange, _ kind: AST.Quantification.Kind = .eager, of child: AST.Node ) -> AST.Node { - let lower = AST.Located(faking: r.lowerBound) - let upper = AST.Located(faking: r.upperBound) - return quant(.range(lower, upper), kind, child) + quant(.range( + .init(r.lowerBound, at: .fake), .init(r.upperBound, at: .fake) + ), kind, child) } func charClass( @@ -370,7 +393,7 @@ func scalarSeq_m(_ s: Unicode.Scalar...) 
-> AST.CustomCharacterClass.Member { func backreference(_ r: AST.Reference.Kind, recursionLevel: Int? = nil) -> AST.Node { atom(.backreference(.init( - r, recursionLevel: recursionLevel.map { .init(faking: $0) }, innerLoc: .fake + r, recursionLevel: recursionLevel.map { .init($0, at: .fake) }, innerLoc: .fake ))) } func subpattern(_ r: AST.Reference.Kind) -> AST.Node { diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 5d5a2b349..952b65ec6 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -157,7 +157,7 @@ func captureTest( file: StaticString = #file, line: UInt = #line ) { - let ast = try! parse(regex, .semantic, .traditional) + let ast = try! parse(regex, .traditional) var capList = ast.captureList.withoutLocs // Peel off the whole match element. capList.captures.removeFirst() diff --git a/Tests/RegexTests/DiagnosticTests.swift b/Tests/RegexTests/DiagnosticTests.swift index 0100a3a86..1a3606bf5 100644 --- a/Tests/RegexTests/DiagnosticTests.swift +++ b/Tests/RegexTests/DiagnosticTests.swift @@ -20,7 +20,7 @@ extension RegexTests { XCTAssert(SourceLocation.fake.isFake) XCTAssert(group(.capture, "a").location.isFake) - let ast = try! parse("(a)", .semantic, .traditional).root + let ast = try! parse("(a)", .traditional).root XCTAssert(ast.location.isReal) } @@ -31,7 +31,7 @@ extension RegexTests { // // Input should be a concatenation or alternation func flatTest(_ str: String, _ expected: [String]) { - guard let ast = try? parse(str, .semantic, .traditional).root else { + guard let ast = try? parse(str, .traditional).root else { XCTFail("Fail to parse: \(str)") return } @@ -53,9 +53,7 @@ extension RegexTests { flatTest("a|(b)|", ["a", "(b)", ""]) func renderTest(_ str: String, _ expected: [String]) { - let lines = try! parse( - str, .semantic, .traditional - )._render(in: str) + let lines = try! 
parse(str, .traditional)._render(in: str) func fail() { XCTFail(""" expected: diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index e28c72514..49184deb3 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -18,47 +18,49 @@ func diagnose( _ input: String, expecting expected: ParseError, _ syntax: SyntaxOptions = .traditional, - _ f: (inout Source) throws -> (), + _ f: (inout Parser) -> (), file: StaticString = #file, line: UInt = #line ) { - var src = Source(input) - do { - try f(&src) + var parser = Parser(Source(input), syntax: syntax) + f(&parser) + + let diags = parser.diags.diags + guard diags.count == 1 else { XCTFail(""" - Passed, but expected error: \(expected) + Expected single diagnostic """, file: file, line: line) - } catch let e as Source.LocatedError { - guard e.error == expected else { - XCTFail(""" - - Expected: \(expected) - Actual: \(e.error) - """, file: file, line: line) - return - } - } catch let e { - fatalError("Should be unreachable: \(e)") + return + } + + let error = diags[0].underlyingParseError! 
+ guard error == expected else { + XCTFail(""" + + Expected: \(expected) + Actual: \(error) + """, file: file, line: line) + return } } extension RegexTests { func testLexicalAnalysis() { - diagnose("a", expecting: .expected("b")) { src in - try src.expect("b") + diagnose("a", expecting: .expected("b")) { p in + p.expect("b") } - diagnose("", expecting: .unexpectedEndOfInput) { src in - try src.expectNonEmpty() + diagnose("", expecting: .unexpectedEndOfInput) { p in + p.expectNonEmpty() } - diagnose("a", expecting: .unexpectedEndOfInput) { src in - try src.expect("a") // Ok - try src.expectNonEmpty() // Error + diagnose("a", expecting: .unexpectedEndOfInput) { p in + p.expect("a") // Ok + p.expectNonEmpty() // Error } let bigNum = "12345678901234567890" - diagnose(bigNum, expecting: .numberOverflow(bigNum)) { src in - _ = try src.lexNumber() + diagnose(bigNum, expecting: .numberOverflow(bigNum)) { p in + _ = p.lexNumber() } // TODO: want to dummy print out source ranges, etc, test that. diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 207d7e13d..51654c057 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -33,56 +33,38 @@ extension AST.CustomCharacterClass.Member: ExpressibleByExtendedGraphemeClusterL } } -enum SemanticErrorKind { - case unsupported, invalid, unchecked -} - class RegexTests: XCTestCase {} func parseTest( _ input: String, _ expectedAST: AST.Node, - throwsError errorKind: SemanticErrorKind? 
= nil, - syntax: SyntaxOptions = .traditional, + throwsError expectedErrors: ParseError..., unsupported: Bool = false, + uncheckedErrors: Bool = false, syntax: SyntaxOptions = .traditional, captures expectedCaptures: CaptureList = [], file: StaticString = #file, line: UInt = #line ) { parseTest( - input, .init(expectedAST, globalOptions: nil), throwsError: errorKind, - syntax: syntax, captures: expectedCaptures, file: file, line: line + input, .init(expectedAST, globalOptions: nil, diags: Diagnostics()), + throwsError: expectedErrors, unsupported: unsupported, + uncheckedErrors: uncheckedErrors, syntax: syntax, + captures: expectedCaptures, file: file, line: line ) } func parseTest( _ input: String, _ expectedAST: AST, - throwsError errorKind: SemanticErrorKind? = nil, + throwsError expectedErrors: [ParseError] = [], unsupported: Bool = false, + uncheckedErrors: Bool = false, syntax: SyntaxOptions = .traditional, captures expectedCaptures: CaptureList = [], file: StaticString = #file, line: UInt = #line ) { - let ast: AST - do { - ast = try parse(input, errorKind != nil ? 
.syntactic : .semantic, syntax) - } catch { - XCTFail("unexpected error: \(error)", file: file, line: line) - return - } - if let errorKind = errorKind, errorKind != .unchecked { - do { - _ = try parse(input, .semantic, syntax) - XCTFail("expected semantically invalid AST", file: file, line: line) - } catch let e as Source.LocatedError { - switch e.error { - case .unsupported: - XCTAssertEqual(errorKind, .unsupported, "\(e)", file: file, line: line) - default: - XCTAssertEqual(errorKind, .invalid, "\(e)", file: file, line: line) - } - } catch { - XCTFail("Error without source location: \(error)", file: file, line: line) - } - } + let ast = parseWithRecovery(input, syntax) + matchDiagnostics( + expectedErrors, for: ast, unsupported: unsupported, + unchecked: uncheckedErrors, file: file, line: line + ) guard ast == expectedAST || ast._dump() == expectedAST._dump() // EQ workaround else { @@ -172,42 +154,25 @@ func delimiterLexingTest( /// not considered part of it. func parseWithDelimitersTest( _ input: String, _ expecting: AST.Node, - throwsError errorKind: SemanticErrorKind? = nil, - ignoreTrailing: Bool = false, file: StaticString = #file, line: UInt = #line + throwsError expectedErrors: ParseError..., unsupported: Bool = false, + uncheckedErrors: Bool = false, ignoreTrailing: Bool = false, + file: StaticString = #file, line: UInt = #line ) { // First try lexing. let literal = delimiterLexingTest( input, ignoreTrailing: ignoreTrailing, file: file, line: line) - let ast: AST.Node - do { - ast = try parseWithDelimiters( - literal, errorKind != nil ? 
.syntactic : .semantic).root - } catch { - XCTFail("unexpected error: \(error)", file: file, line: line) - return - } - if let errorKind = errorKind { - do { - _ = try parseWithDelimiters(input, .semantic) - XCTFail("expected semantically invalid AST", file: file, line: line) - } catch let e as Source.LocatedError { - switch e.error { - case .unsupported: - XCTAssertEqual(errorKind, .unsupported, "\(e)", file: file, line: line) - default: - XCTAssertEqual(errorKind, .invalid, "\(e)", file: file, line: line) - } - } catch { - XCTFail("Error without source location: \(error)", file: file, line: line) - } - } - guard ast == expecting - || ast._dump() == expecting._dump() // EQ workaround + let ast = parseWithDelimitersWithRecovery(literal) + matchDiagnostics( + expectedErrors, for: ast, unsupported: unsupported, + unchecked: uncheckedErrors, file: file, line: line + ) + guard ast.root == expecting + || ast.root._dump() == expecting._dump() // EQ workaround else { XCTFail(""" Expected: \(expecting._dump()) - Found: \(ast._dump()) + Found: \(ast.root._dump()) """, file: file, line: line) return @@ -220,8 +185,8 @@ func parseNotEqualTest( syntax: SyntaxOptions = .traditional, file: StaticString = #file, line: UInt = #line ) { - let lhsAST = try! parse(lhs, .syntactic, syntax) - let rhsAST = try! parse(rhs, .syntactic, syntax) + let lhsAST = parseWithRecovery(lhs, syntax) + let rhsAST = parseWithRecovery(rhs, syntax) if lhsAST == rhsAST || lhsAST._dump() == rhsAST._dump() { XCTFail(""" AST: \(lhsAST._dump()) @@ -237,7 +202,7 @@ func rangeTest( at locFn: (AST.Node) -> SourceLocation = \.location, file: StaticString = #file, line: UInt = #line ) { - let ast = try! 
parse(input, .syntactic, syntax).root + let ast = parseWithRecovery(input, syntax).root let range = input.offsets(of: locFn(ast).range) let expected = expectedRange(input) @@ -251,60 +216,70 @@ func rangeTest( } } -func diagnosticTest( - _ input: String, _ expected: ParseError, - syntax: SyntaxOptions = .traditional, - file: StaticString = #file, line: UInt = #line +func matchDiagnostics( + _ expected: [ParseError], for ast: AST, unsupported: Bool, unchecked: Bool, + file: StaticString, line: UInt ) { - do { - let ast = try parse(input, .semantic, syntax) - XCTFail(""" + guard !unchecked else { return } + + var errors = Set() + for diag in ast.diags.diags where diag.isAnyError { + guard let underlying = diag.underlyingParseError else { + XCTFail( + "Unknown error emitted: '\(diag.message)'", file: file, line: line) + continue + } + // TODO: We should be uniquing based on source location, and failing if we + // emit duplicate diagnostics at the same location. + errors.insert(underlying) + } - Passed \(ast) - But expected error: \(expected) - """, file: file, line: line) - } catch let e as Source.LocatedError { - guard e.error == expected else { + // Filter out any unsupported errors if needed. 
+ if unsupported { + errors = errors.filter { + if case .unsupported = $0 { return false } else { return true } + } + } + for mismatched in errors.symmetricDifference(expected) { + if errors.contains(mismatched) { + XCTFail(""" + Unexpected error: \(mismatched) + """, file: file, line: line) + } else { XCTFail(""" - Expected: \(expected) - Actual: \(e.error) + Expected error not emitted: \(mismatched) + for AST: \(ast) """, file: file, line: line) - return } - } catch let e { - XCTFail("Error without source location: \(e)", file: file, line: line) } } -func diagnosticWithDelimitersTest( - _ input: String, _ expected: ParseError, ignoreTrailing: Bool = false, +func diagnosticTest( + _ input: String, _ expectedErrors: ParseError..., unsupported: Bool = false, + syntax: SyntaxOptions = .traditional, file: StaticString = #file, line: UInt = #line +) { + let ast = parseWithRecovery(input, syntax) + matchDiagnostics( + expectedErrors, for: ast, unsupported: unsupported, unchecked: false, + file: file, line: line + ) +} + +func diagnosticWithDelimitersTest( + _ input: String, _ expectedErrors: ParseError..., unsupported: Bool = false, + ignoreTrailing: Bool = false, file: StaticString = #file, line: UInt = #line ) { // First try lexing. 
let literal = delimiterLexingTest( input, ignoreTrailing: ignoreTrailing, file: file, line: line) - do { - let orig = try parseWithDelimiters(literal, .semantic) - let ast = orig.root - XCTFail(""" - - Passed \(ast) - But expected error: \(expected) - """, file: file, line: line) - } catch let e as Source.LocatedError { - guard e.error == expected else { - XCTFail(""" - - Expected: \(expected) - Actual: \(e.error) - """, file: file, line: line) - return - } - } catch let e { - XCTFail("Error without source location: \(e)", file: file, line: line) - } + let ast = parseWithDelimitersWithRecovery(literal) + matchDiagnostics( + expectedErrors, for: ast, unsupported: unsupported, unchecked: false, + file: file, line: line + ) } func delimiterLexingDiagnosticTest( @@ -344,6 +319,7 @@ func compilerInterfaceDiagnosticMessageTest( input, captureBufferOut: captureBuffer) XCTFail("Expected parse error", file: file, line: line) } catch let error as CompilerParseError { + XCTAssertNotNil(error.location, "Error without location", file: file, line: line) XCTAssertEqual(expectedErr, error.message, file: file, line: line) } catch { fatalError("Expected CompilerParseError") @@ -505,7 +481,7 @@ extension RegexTests { // FIXME: '\N' should be emitted through 'emitAny', not through the // _CharacterClassModel model. 
- parseTest(#"\N"#, escaped(.notNewline), throwsError: .unsupported) + parseTest(#"\N"#, escaped(.notNewline), unsupported: true) parseTest(#"\R"#, escaped(.newlineSequence)) @@ -680,12 +656,12 @@ extension RegexTests { range_m(.keyboardControl("A"), .keyboardControl("B")), range_m(.keyboardMetaControl("A"), .keyboardMetaControl("B")), range_m(.keyboardMeta("A"), .keyboardMeta("B")) - ), throwsError: .unsupported) + ), unsupported: true) parseTest( #"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass( range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE"))), - throwsError: .unsupported) + unsupported: true) parseTest( #"[\u{AA}-\u{BB}]"#, @@ -696,17 +672,17 @@ extension RegexTests { parseTest( #"[\u{AA BB}-\u{CC}]"#, charClass(range_m(scalarSeq_a("\u{AA}", "\u{BB}"), scalar_a("\u{CC}"))), - throwsError: .unsupported + unsupported: true ) parseTest( #"[\u{CC}-\u{AA BB}]"#, charClass(range_m(scalar_a("\u{CC}"), scalarSeq_a("\u{AA}", "\u{BB}"))), - throwsError: .unsupported + unsupported: true ) parseTest( #"[\u{a b c}]"#, charClass(scalarSeq_m("\u{A}", "\u{B}", "\u{C}")), - throwsError: .unsupported + unsupported: true ) parseTest(#"(?x)[ a - b ]"#, concat( @@ -822,13 +798,13 @@ extension RegexTests { parseTest(#"\\#u{3000}"#, "\u{3000}") // Control and meta controls. 
- parseTest(#"\c "#, atom(.keyboardControl(" ")), throwsError: .unsupported) - parseTest(#"\c!"#, atom(.keyboardControl("!")), throwsError: .unsupported) - parseTest(#"\c~"#, atom(.keyboardControl("~")), throwsError: .unsupported) - parseTest(#"\C--"#, atom(.keyboardControl("-")), throwsError: .unsupported) - parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a")), throwsError: .unsupported) - parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-")), throwsError: .unsupported) - parseTest(#"\M-a"#, atom(.keyboardMeta("a")), throwsError: .unsupported) + parseTest(#"\c "#, atom(.keyboardControl(" ")), unsupported: true) + parseTest(#"\c!"#, atom(.keyboardControl("!")), unsupported: true) + parseTest(#"\c~"#, atom(.keyboardControl("~")), unsupported: true) + parseTest(#"\C--"#, atom(.keyboardControl("-")), unsupported: true) + parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a")), unsupported: true) + parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-")), unsupported: true) + parseTest(#"\M-a"#, atom(.keyboardMeta("a")), unsupported: true) // MARK: Comments @@ -933,11 +909,11 @@ extension RegexTests { // Balanced captures parseTest(#"(?)"#, balancedCapture(name: "a", priorName: "c", empty()), - throwsError: .unsupported, captures: [.named("a")]) + unsupported: true, captures: [.named("a")]) parseTest(#"(?<-c>)"#, balancedCapture(name: nil, priorName: "c", empty()), - throwsError: .unsupported, captures: [.cap]) + unsupported: true, captures: [.cap]) parseTest(#"(?'a-b'c)"#, balancedCapture(name: "a", priorName: "b", "c"), - throwsError: .unsupported, captures: [.named("a")]) + unsupported: true, captures: [.named("a")]) // Capture resets. // FIXME: The captures in each branch should be unified. 
For now, we don't @@ -945,29 +921,30 @@ extension RegexTests { parseTest( "(?|(a)|(b))", nonCaptureReset(alt(capture("a"), capture("b"))), - throwsError: .unsupported, captures: [.opt, .opt] + unsupported: true, captures: [.opt, .opt] ) parseTest( "(?|(?a)|(b))", nonCaptureReset(alt(namedCapture("x", "a"), capture("b"))), - throwsError: .unsupported, captures: [.named("x", opt: 1), .opt] + unsupported: true, captures: [.named("x", opt: 1), .opt] ) parseTest( "(?|(a)|(?b))", nonCaptureReset(alt(capture("a"), namedCapture("x", "b"))), - throwsError: .unsupported, captures: [.opt, .named("x", opt: 1)] + unsupported: true, captures: [.opt, .named("x", opt: 1)] ) parseTest( "(?|(?a)|(?b))", nonCaptureReset(alt(namedCapture("x", "a"), namedCapture("x", "b"))), - throwsError: .invalid, captures: [.named("x", opt: 1), .named("x", opt: 1)] + throwsError: .duplicateNamedCapture("x"), unsupported: true, + captures: [.named("x", opt: 1), .named("x", opt: 1)] ) // TODO: Reject mismatched names? parseTest( "(?|(?a)|(?b))", nonCaptureReset(alt(namedCapture("x", "a"), namedCapture("y", "b"))), - throwsError: .unsupported, captures: [.named("x", opt: 1), .named("y", opt: 1)] + unsupported: true, captures: [.named("x", opt: 1), .named("y", opt: 1)] ) // Other groups @@ -976,7 +953,7 @@ extension RegexTests { concat("a", nonCapture("b"), "c")) parseTest( #"a(?|b)c"#, - concat("a", nonCaptureReset("b"), "c"), throwsError: .unsupported) + concat("a", nonCaptureReset("b"), "c"), unsupported: true) parseTest( #"a(?>b)c"#, concat("a", atomicNonCapturing("b"), "c")) @@ -994,41 +971,41 @@ extension RegexTests { concat("a", negativeLookahead("b"), "c")) parseTest("a(?<=b)c", - concat("a", lookbehind("b"), "c"), throwsError: .unsupported) + concat("a", lookbehind("b"), "c"), unsupported: true) parseTest("a(*plb:b)c", - concat("a", lookbehind("b"), "c"), throwsError: .unsupported) + concat("a", lookbehind("b"), "c"), unsupported: true) parseTest("a(*positive_lookbehind:b)c", - concat("a", 
lookbehind("b"), "c"), throwsError: .unsupported) + concat("a", lookbehind("b"), "c"), unsupported: true) parseTest("a(?"#, backreference(.relative(4)), throwsError: .unsupported) - parseTest(#"\k<2>"#, backreference(.absolute(2)), throwsError: .invalid) - parseTest(#"\k'-3'"#, backreference(.relative(-3)), throwsError: .unsupported) - parseTest(#"\k'1'"#, backreference(.absolute(1)), throwsError: .invalid) + parseTest(#"\113"#, backreference(ref(113)), throwsError: .invalidReference(113)) + parseTest(#"\377"#, backreference(ref(377)), throwsError: .invalidReference(377)) + parseTest(#"\81"#, backreference(ref(81)), throwsError: .invalidReference(81)) + + parseTest(#"\g1"#, backreference(ref(1)), throwsError: .invalidReference(1)) + parseTest(#"\g001"#, backreference(ref(1)), throwsError: .invalidReference(1)) + parseTest(#"\g52"#, backreference(ref(52)), throwsError: .invalidReference(52)) + parseTest(#"\g-01"#, backreference(ref(minus: 1)), unsupported: true) + parseTest(#"\g+30"#, backreference(ref(plus: 30)), unsupported: true) + + parseTest(#"\g{1}"#, backreference(ref(1)), throwsError: .invalidReference(1)) + parseTest(#"\g{001}"#, backreference(ref(1)), throwsError: .invalidReference(1)) + parseTest(#"\g{52}"#, backreference(ref(52)), throwsError: .invalidReference(52)) + parseTest(#"\g{-01}"#, backreference(ref(minus: 1)), unsupported: true) + parseTest(#"\g{+30}"#, backreference(ref(plus: 30)), unsupported: true) + parseTest(#"\k<+4>"#, backreference(ref(plus: 4)), unsupported: true) + parseTest(#"\k<2>"#, backreference(ref(2)), throwsError: .invalidReference(2)) + parseTest(#"\k'-3'"#, backreference(ref(minus: 3)), unsupported: true) + parseTest(#"\k'1'"#, backreference(ref(1)), throwsError: .invalidReference(1)) parseTest( #"(?)\k"#, concat( @@ -1307,52 +1284,60 @@ extension RegexTests { ), captures: [.named("a")] ) - parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .invalid) - parseTest(#"\k"#, backreference(.named("bc")), throwsError: 
.invalid) - parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .invalid) - parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .invalid) + parseTest(#"\k{a0}"#, backreference(.named("a0")), throwsError: .invalidNamedReference("a0")) + parseTest(#"\k"#, backreference(.named("bc")), throwsError: .invalidNamedReference("bc")) + parseTest(#"\g{abc}"#, backreference(.named("abc")), throwsError: .invalidNamedReference("abc")) + parseTest(#"(?P=abc)"#, backreference(.named("abc")), throwsError: .invalidNamedReference("abc")) // Oniguruma recursion levels. - parseTest(#"\k"#, backreference(.named("bc"), recursionLevel: 0), throwsError: .unsupported) - parseTest(#"\k"#, backreference(.named("a"), recursionLevel: 0), throwsError: .unsupported) - parseTest(#"\k<1+1>"#, backreference(.absolute(1), recursionLevel: 1), throwsError: .unsupported) - parseTest(#"\k<3-8>"#, backreference(.absolute(3), recursionLevel: -8), throwsError: .unsupported) - parseTest(#"\k'-3-8'"#, backreference(.relative(-3), recursionLevel: -8), throwsError: .unsupported) - parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8), throwsError: .unsupported) - parseTest(#"\k'+3-8'"#, backreference(.relative(3), recursionLevel: -8), throwsError: .unsupported) - parseTest(#"\k'+3+8'"#, backreference(.relative(3), recursionLevel: 8), throwsError: .unsupported) - - parseTest(#"(?R)"#, subpattern(.recurseWholePattern), throwsError: .unsupported) - parseTest(#"(?0)"#, subpattern(.recurseWholePattern), throwsError: .unsupported) - parseTest(#"(?1)"#, subpattern(.absolute(1)), throwsError: .unsupported) - parseTest(#"(?+12)"#, subpattern(.relative(12)), throwsError: .unsupported) - parseTest(#"(?-2)"#, subpattern(.relative(-2)), throwsError: .unsupported) - parseTest(#"(?&hello)"#, subpattern(.named("hello")), throwsError: .unsupported) - parseTest(#"(?P>P)"#, subpattern(.named("P")), throwsError: .unsupported) + parseTest(#"\k"#, backreference(.named("bc"), 
recursionLevel: 0), + throwsError: .invalidNamedReference("bc"), unsupported: true) + parseTest(#"\k"#, backreference(.named("a"), recursionLevel: 0), + throwsError: .invalidNamedReference("a"), unsupported: true) + parseTest(#"\k<1+1>"#, backreference(ref(1), recursionLevel: 1), + throwsError: .invalidReference(1), unsupported: true) + parseTest(#"\k<3-8>"#, backreference(ref(3), recursionLevel: -8), + throwsError: .invalidReference(3), unsupported: true) + parseTest(#"\k'-3-8'"#, backreference(ref(minus: 3), recursionLevel: -8), + unsupported: true) + parseTest(#"\k'bc-8'"#, backreference(.named("bc"), recursionLevel: -8), + throwsError: .invalidNamedReference("bc"), unsupported: true) + parseTest(#"\k'+3-8'"#, backreference(ref(plus: 3), recursionLevel: -8), + unsupported: true) + parseTest(#"\k'+3+8'"#, backreference(ref(plus: 3), recursionLevel: 8), + unsupported: true) + + parseTest(#"(?R)"#, subpattern(ref(0)), unsupported: true) + parseTest(#"(?0)"#, subpattern(ref(0)), unsupported: true) + parseTest(#"(?1)"#, subpattern(ref(1)), unsupported: true) + parseTest(#"(?+12)"#, subpattern(ref(plus: 12)), unsupported: true) + parseTest(#"(?-2)"#, subpattern(ref(minus: 2)), unsupported: true) + parseTest(#"(?&hello)"#, subpattern(.named("hello")), unsupported: true) + parseTest(#"(?P>P)"#, subpattern(.named("P")), unsupported: true) parseTest(#"[(?R)]"#, charClass("(", "?", "R", ")")) parseTest(#"[(?&a)]"#, charClass("(", "?", "&", "a", ")")) parseTest(#"[(?1)]"#, charClass("(", "?", "1", ")")) - parseTest(#"\g<1>"#, subpattern(.absolute(1)), throwsError: .unsupported) - parseTest(#"\g<001>"#, subpattern(.absolute(1)), throwsError: .unsupported) - parseTest(#"\g'52'"#, subpattern(.absolute(52)), throwsError: .unsupported) - parseTest(#"\g'-01'"#, subpattern(.relative(-1)), throwsError: .unsupported) - parseTest(#"\g'+30'"#, subpattern(.relative(30)), throwsError: .unsupported) - parseTest(#"\g'abc'"#, subpattern(.named("abc")), throwsError: .unsupported) + 
parseTest(#"\g<1>"#, subpattern(ref(1)), unsupported: true) + parseTest(#"\g<001>"#, subpattern(ref(1)), unsupported: true) + parseTest(#"\g'52'"#, subpattern(ref(52)), unsupported: true) + parseTest(#"\g'-01'"#, subpattern(ref(minus: 1)), unsupported: true) + parseTest(#"\g'+30'"#, subpattern(ref(plus: 30)), unsupported: true) + parseTest(#"\g'abc'"#, subpattern(.named("abc")), unsupported: true) // These are valid references. parseTest(#"()\1"#, concat( - capture(empty()), backreference(.absolute(1)) + capture(empty()), backreference(ref(1)) ), captures: [.cap]) parseTest(#"\1()"#, concat( - backreference(.absolute(1)), capture(empty()) + backreference(ref(1)), capture(empty()) ), captures: [.cap]) parseTest(#"()()\2"#, concat( - capture(empty()), capture(empty()), backreference(.absolute(2)) + capture(empty()), capture(empty()), backreference(ref(2)) ), captures: [.cap, .cap]) parseTest(#"()\2()"#, concat( - capture(empty()), backreference(.absolute(2)), capture(empty()) + capture(empty()), backreference(ref(2)), capture(empty()) ), captures: [.cap, .cap]) // MARK: Character names. 
@@ -1362,7 +1347,7 @@ extension RegexTests { parseTest(#"\N{abc}+"#, oneOrMore(of: atom(.namedCharacter("abc")))) parseTest( #"\N {2}"#, - concat(atom(.escaped(.notNewline)), exactly(2, of: " ")), throwsError: .unsupported + concat(atom(.escaped(.notNewline)), exactly(2, of: " ")), unsupported: true ) parseTest(#"\N{AA}"#, atom(.namedCharacter("AA"))) @@ -1428,12 +1413,12 @@ extension RegexTests { parseTest(#"\p{isAlphabetic}"#, prop(.binary(.alphabetic))) parseTest(#"\p{isAlpha=isFalse}"#, prop(.binary(.alphabetic, value: false))) - parseTest(#"\p{In_Runic}"#, prop(.block(.runic)), throwsError: .unsupported) + parseTest(#"\p{In_Runic}"#, prop(.block(.runic)), unsupported: true) parseTest(#"\p{Hebrew}"#, prop(.scriptExtension(.hebrew))) parseTest(#"\p{Is_Hebrew}"#, prop(.scriptExtension(.hebrew))) - parseTest(#"\p{In_Hebrew}"#, prop(.block(.hebrew)), throwsError: .unsupported) - parseTest(#"\p{Blk=Is_Hebrew}"#, prop(.block(.hebrew)), throwsError: .unsupported) + parseTest(#"\p{In_Hebrew}"#, prop(.block(.hebrew)), unsupported: true) + parseTest(#"\p{Blk=Is_Hebrew}"#, prop(.block(.hebrew)), unsupported: true) // These are the shorthand properties with an "in" prefix we currently // recognize. Make sure they don't clash with block properties. @@ -1456,38 +1441,38 @@ extension RegexTests { parseTest(#"\p{is\#(p.rawValue)}"#, prop(.posix(p))) } for b in Unicode.BinaryProperty.allCases { - // Some of these are unsupported, so don't check for semantic errors. - parseTest(#"\p{\#(b.rawValue)}"#, prop(.binary(b, value: true)), throwsError: .unchecked) - parseTest(#"\p{is\#(b.rawValue)}"#, prop(.binary(b, value: true)), throwsError: .unchecked) + // Some of these are unsupported, so don't check for errors. 
+ parseTest(#"\p{\#(b.rawValue)}"#, prop(.binary(b, value: true)), uncheckedErrors: true) + parseTest(#"\p{is\#(b.rawValue)}"#, prop(.binary(b, value: true)), uncheckedErrors: true) } for j in AST.Atom.CharacterProperty.JavaSpecial.allCases { - parseTest(#"\p{\#(j.rawValue)}"#, prop(.javaSpecial(j)), throwsError: .unsupported) + parseTest(#"\p{\#(j.rawValue)}"#, prop(.javaSpecial(j)), unsupported: true) } // Try prefixing each block property with "in" to make sure we don't stomp // on any other property shorthands. for b in Unicode.Block.allCases { - parseTest(#"\p{in\#(b.rawValue)}"#, prop(.block(b)), throwsError: .unsupported) + parseTest(#"\p{in\#(b.rawValue)}"#, prop(.block(b)), unsupported: true) } parseTest(#"\p{ASCII}"#, prop(.ascii)) parseTest(#"\p{isASCII}"#, prop(.ascii)) - parseTest(#"\p{inASCII}"#, prop(.block(.basicLatin)), throwsError: .unsupported) + parseTest(#"\p{inASCII}"#, prop(.block(.basicLatin)), unsupported: true) - parseTest(#"\p{inBasicLatin}"#, prop(.block(.basicLatin)), throwsError: .unsupported) - parseTest(#"\p{In_Basic_Latin}"#, prop(.block(.basicLatin)), throwsError: .unsupported) - parseTest(#"\p{Blk=Basic_Latin}"#, prop(.block(.basicLatin)), throwsError: .unsupported) - parseTest(#"\p{Blk=Is_Basic_Latin}"#, prop(.block(.basicLatin)), throwsError: .unsupported) + parseTest(#"\p{inBasicLatin}"#, prop(.block(.basicLatin)), unsupported: true) + parseTest(#"\p{In_Basic_Latin}"#, prop(.block(.basicLatin)), unsupported: true) + parseTest(#"\p{Blk=Basic_Latin}"#, prop(.block(.basicLatin)), unsupported: true) + parseTest(#"\p{Blk=Is_Basic_Latin}"#, prop(.block(.basicLatin)), unsupported: true) parseTest(#"\p{isAny}"#, prop(.any)) parseTest(#"\p{isAssigned}"#, prop(.assigned)) - parseTest(#"\p{Xan}"#, prop(.pcreSpecial(.alphanumeric)), throwsError: .unsupported) - parseTest(#"\p{Xps}"#, prop(.pcreSpecial(.posixSpace)), throwsError: .unsupported) - parseTest(#"\p{Xsp}"#, prop(.pcreSpecial(.perlSpace)), throwsError: .unsupported) - 
parseTest(#"\p{Xuc}"#, prop(.pcreSpecial(.universallyNamed)), throwsError: .unsupported) - parseTest(#"\p{Xwd}"#, prop(.pcreSpecial(.perlWord)), throwsError: .unsupported) + parseTest(#"\p{Xan}"#, prop(.pcreSpecial(.alphanumeric)), unsupported: true) + parseTest(#"\p{Xps}"#, prop(.pcreSpecial(.posixSpace)), unsupported: true) + parseTest(#"\p{Xsp}"#, prop(.pcreSpecial(.perlSpace)), unsupported: true) + parseTest(#"\p{Xuc}"#, prop(.pcreSpecial(.universallyNamed)), unsupported: true) + parseTest(#"\p{Xwd}"#, prop(.pcreSpecial(.perlWord)), unsupported: true) parseTest(#"\p{alnum}"#, prop(.posix(.alnum))) parseTest(#"\p{is_alnum}"#, prop(.posix(.alnum))) @@ -1507,45 +1492,45 @@ extension RegexTests { // MARK: Conditionals parseTest(#"(?(1))"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()), unsupported: true) parseTest(#"(?(1)|)"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty()), unsupported: true) parseTest(#"(?(1)a)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty()), throwsError: .unsupported) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty()), unsupported: true) parseTest(#"(?(1)a|)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty()), throwsError: .unsupported) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: empty()), unsupported: true) parseTest(#"(?(1)|b)"#, conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: "b"), throwsError: .unsupported) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: "b"), unsupported: true) parseTest(#"(?(1)a|b)"#, conditional( - .groupMatched(ref(1)), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) + .groupMatched(ref(1)), trueBranch: "a", falseBranch: "b"), 
unsupported: true) parseTest(#"(?(1)(a|b|c)|d)"#, conditional( .groupMatched(ref(1)), trueBranch: capture(alt("a", "b", "c")), falseBranch: "d" - ), throwsError: .unsupported, captures: [.opt]) + ), unsupported: true, captures: [.opt]) parseTest(#"(?(+3))"#, conditional( - .groupMatched(ref(plus: 3)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) + .groupMatched(ref(plus: 3)), trueBranch: empty(), falseBranch: empty()), unsupported: true) parseTest(#"(?(-21))"#, conditional( - .groupMatched(ref(minus: 21)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) + .groupMatched(ref(minus: 21)), trueBranch: empty(), falseBranch: empty()), unsupported: true) // Oniguruma recursion levels. parseTest(#"(?(1+1))"#, conditional( .groupMatched(ref(1, recursionLevel: 1)), - trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported + trueBranch: empty(), falseBranch: empty()), unsupported: true ) parseTest(#"(?(-1+1))"#, conditional( .groupMatched(ref(minus: 1, recursionLevel: 1)), - trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported + trueBranch: empty(), falseBranch: empty()), unsupported: true ) parseTest(#"(?(1-3))"#, conditional( .groupMatched(ref(1, recursionLevel: -3)), - trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported + trueBranch: empty(), falseBranch: empty()), unsupported: true ) parseTest(#"(?(+1-3))"#, conditional( .groupMatched(ref(plus: 1, recursionLevel: -3)), - trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported + trueBranch: empty(), falseBranch: empty()), unsupported: true ) parseTest( #"(?)(?(a+5))"#, @@ -1553,7 +1538,7 @@ extension RegexTests { .groupMatched(ref("a", recursionLevel: 5)), trueBranch: empty(), falseBranch: empty() )), - throwsError: .unsupported, captures: [.named("a")] + unsupported: true, captures: [.named("a")] ) parseTest( #"(?)(?(a1-5))"#, @@ -1561,50 +1546,50 @@ extension RegexTests { .groupMatched(ref("a1", recursionLevel: 
-5)), trueBranch: empty(), falseBranch: empty() )), - throwsError: .unsupported, captures: [.named("a1")] + unsupported: true, captures: [.named("a1")] ) parseTest(#"(?(1))?"#, zeroOrOne(of: conditional( - .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty())), throwsError: .unsupported) + .groupMatched(ref(1)), trueBranch: empty(), falseBranch: empty())), unsupported: true) parseTest(#"(?(R)a|b)"#, conditional( - .recursionCheck, trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) + .recursionCheck, trueBranch: "a", falseBranch: "b"), unsupported: true) parseTest(#"(?(R1))"#, conditional( - .groupRecursionCheck(ref(1)), trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) + .groupRecursionCheck(ref(1)), trueBranch: empty(), falseBranch: empty()), unsupported: true) parseTest(#"(?(R&abc)a|b)"#, conditional( - .groupRecursionCheck(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) + .groupRecursionCheck(ref("abc")), trueBranch: "a", falseBranch: "b"), unsupported: true) parseTest(#"(?()a|b)"#, conditional( - .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) + .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b"), unsupported: true) parseTest(#"(?('abc')a|b)"#, conditional( - .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b"), throwsError: .unsupported) + .groupMatched(ref("abc")), trueBranch: "a", falseBranch: "b"), unsupported: true) parseTest(#"(?(abc)a|b)"#, conditional( groupCondition(.capture, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - ), throwsError: .unsupported, captures: [.cap]) + ), unsupported: true, captures: [.cap]) parseTest(#"(?(?:abc)a|b)"#, conditional( groupCondition(.nonCapture, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - ), throwsError: .unsupported) + ), unsupported: true) parseTest(#"(?(?=abc)a|b)"#, conditional( groupCondition(.lookahead, concat("a", "b", "c")), trueBranch: "a", 
falseBranch: "b" - ), throwsError: .unsupported) + ), unsupported: true) parseTest(#"(?(?!abc)a|b)"#, conditional( groupCondition(.negativeLookahead, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - ), throwsError: .unsupported) + ), unsupported: true) parseTest(#"(?(?<=abc)a|b)"#, conditional( groupCondition(.lookbehind, concat("a", "b", "c")), trueBranch: "a", falseBranch: "b" - ), throwsError: .unsupported) + ), unsupported: true) parseTest(#"(?(?y)(?(xxx)a|b)"#, concat( namedCapture("xxx", "y"), conditional(.groupMatched(ref("xxx")), trueBranch: "a", falseBranch: "b") - ), throwsError: .unsupported, captures: [.named("xxx")]) + ), unsupported: true, captures: [.named("xxx")]) parseTest(#"(?(1)(?(2)(?(3)))|a)"#, conditional( .groupMatched(ref(1)), @@ -1634,119 +1619,119 @@ extension RegexTests { trueBranch: empty(), falseBranch: empty()), falseBranch: empty()), - falseBranch: "a"), throwsError: .unsupported) + falseBranch: "a"), unsupported: true) parseTest(#"(?(DEFINE))"#, conditional( - .defineGroup, trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported) + .defineGroup, trueBranch: empty(), falseBranch: empty()), unsupported: true) parseTest(#"(?(VERSION>=3.1))"#, conditional( pcreVersionCheck(.greaterThanOrEqual, 3, 1), - trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported + trueBranch: empty(), falseBranch: empty()), unsupported: true ) parseTest(#"(?(VERSION=0.1))"#, conditional( pcreVersionCheck(.equal, 0, 1), - trueBranch: empty(), falseBranch: empty()), throwsError: .unsupported + trueBranch: empty(), falseBranch: empty()), unsupported: true ) // MARK: Callouts // PCRE callouts - parseTest(#"(?C)"#, pcreCallout(.number(0)), throwsError: .unsupported) - parseTest(#"(?C0)"#, pcreCallout(.number(0)), throwsError: .unsupported) - parseTest(#"(?C20)"#, pcreCallout(.number(20)), throwsError: .unsupported) - parseTest("(?C{abc})", pcreCallout(.string("abc")), throwsError: .unsupported) + parseTest(#"(?C)"#, 
pcreCallout(number: 0), unsupported: true) + parseTest(#"(?C0)"#, pcreCallout(number: 0), unsupported: true) + parseTest(#"(?C20)"#, pcreCallout(number: 20), unsupported: true) + parseTest("(?C{abc})", pcreCallout(string: "abc"), unsupported: true) for delim in ["`", "'", "\"", "^", "%", "#", "$"] { - parseTest("(?C\(delim)hello\(delim))", pcreCallout(.string("hello")), - throwsError: .unsupported) + parseTest("(?C\(delim)hello\(delim))", pcreCallout(string: "hello"), + unsupported: true) } // Oniguruma named callouts - parseTest("(*X)", onigurumaNamedCallout("X"), throwsError: .unsupported) - parseTest("(*foo[t])", onigurumaNamedCallout("foo", tag: "t"), throwsError: .unsupported) - parseTest("(*foo[a0]{b})", onigurumaNamedCallout("foo", tag: "a0", args: "b"), throwsError: .unsupported) - parseTest("(*foo{b})", onigurumaNamedCallout("foo", args: "b"), throwsError: .unsupported) - parseTest("(*foo[a]{a,b,c})", onigurumaNamedCallout("foo", tag: "a", args: "a", "b", "c"), throwsError: .unsupported) - parseTest("(*foo{a,b,c})", onigurumaNamedCallout("foo", args: "a", "b", "c"), throwsError: .unsupported) - parseTest("(*foo{%%$,!!,>>})", onigurumaNamedCallout("foo", args: "%%$", "!!", ">>"), throwsError: .unsupported) - parseTest("(*foo{a, b, c})", onigurumaNamedCallout("foo", args: "a", " b", " c"), throwsError: .unsupported) + parseTest("(*X)", onigurumaNamedCallout("X"), unsupported: true) + parseTest("(*foo[t])", onigurumaNamedCallout("foo", tag: "t"), unsupported: true) + parseTest("(*foo[a0]{b})", onigurumaNamedCallout("foo", tag: "a0", args: "b"), unsupported: true) + parseTest("(*foo{b})", onigurumaNamedCallout("foo", args: "b"), unsupported: true) + parseTest("(*foo[a]{a,b,c})", onigurumaNamedCallout("foo", tag: "a", args: "a", "b", "c"), unsupported: true) + parseTest("(*foo{a,b,c})", onigurumaNamedCallout("foo", args: "a", "b", "c"), unsupported: true) + parseTest("(*foo{%%$,!!,>>})", onigurumaNamedCallout("foo", args: "%%$", "!!", ">>"), unsupported: true) 
+ parseTest("(*foo{a, b, c})", onigurumaNamedCallout("foo", args: "a", " b", " c"), unsupported: true) // Oniguruma 'of contents' callouts - parseTest("(?{x})", onigurumaCalloutOfContents("x"), throwsError: .unsupported) - parseTest("(?{{{x}}y}}})", onigurumaCalloutOfContents("x}}y"), throwsError: .unsupported) - parseTest("(?{{{x}}})", onigurumaCalloutOfContents("x"), throwsError: .unsupported) - parseTest("(?{x}[tag])", onigurumaCalloutOfContents("x", tag: "tag"), throwsError: .unsupported) - parseTest("(?{x}[tag]<)", onigurumaCalloutOfContents("x", tag: "tag", direction: .inRetraction), throwsError: .unsupported) - parseTest("(?{x}X)", onigurumaCalloutOfContents("x", direction: .both), throwsError: .unsupported) - parseTest("(?{x}>)", onigurumaCalloutOfContents("x"), throwsError: .unsupported) - parseTest("(?{\\x})", onigurumaCalloutOfContents("\\x"), throwsError: .unsupported) - parseTest("(?{\\})", onigurumaCalloutOfContents("\\"), throwsError: .unsupported) + parseTest("(?{x})", onigurumaCalloutOfContents("x"), unsupported: true) + parseTest("(?{{{x}}y}}})", onigurumaCalloutOfContents("x}}y"), unsupported: true) + parseTest("(?{{{x}}})", onigurumaCalloutOfContents("x"), unsupported: true) + parseTest("(?{x}[tag])", onigurumaCalloutOfContents("x", tag: "tag"), unsupported: true) + parseTest("(?{x}[tag]<)", onigurumaCalloutOfContents("x", tag: "tag", direction: .inRetraction), unsupported: true) + parseTest("(?{x}X)", onigurumaCalloutOfContents("x", direction: .both), unsupported: true) + parseTest("(?{x}>)", onigurumaCalloutOfContents("x"), unsupported: true) + parseTest("(?{\\x})", onigurumaCalloutOfContents("\\x"), unsupported: true) + parseTest("(?{\\})", onigurumaCalloutOfContents("\\"), unsupported: true) // MARK: Backtracking directives - parseTest("(*ACCEPT)?", zeroOrOne(of: backtrackingDirective(.accept)), throwsError: .unsupported) + parseTest("(*ACCEPT)?", zeroOrOne(of: backtrackingDirective(.accept)), unsupported: true) parseTest( "(*ACCEPT:a)??", 
zeroOrOne(.reluctant, of: backtrackingDirective(.accept, name: "a")), - throwsError: .unsupported + unsupported: true ) - parseTest("(*:a)", backtrackingDirective(.mark, name: "a"), throwsError: .unsupported) - parseTest("(*MARK:a)", backtrackingDirective(.mark, name: "a"), throwsError: .unsupported) - parseTest("(*F)", backtrackingDirective(.fail), throwsError: .unsupported) - parseTest("(*COMMIT)", backtrackingDirective(.commit), throwsError: .unsupported) - parseTest("(*SKIP)", backtrackingDirective(.skip), throwsError: .unsupported) - parseTest("(*SKIP:SKIP)", backtrackingDirective(.skip, name: "SKIP"), throwsError: .unsupported) - parseTest("(*PRUNE)", backtrackingDirective(.prune), throwsError: .unsupported) - parseTest("(*THEN)", backtrackingDirective(.then), throwsError: .unsupported) + parseTest("(*:a)", backtrackingDirective(.mark, name: "a"), unsupported: true) + parseTest("(*MARK:a)", backtrackingDirective(.mark, name: "a"), unsupported: true) + parseTest("(*F)", backtrackingDirective(.fail), unsupported: true) + parseTest("(*COMMIT)", backtrackingDirective(.commit), unsupported: true) + parseTest("(*SKIP)", backtrackingDirective(.skip), unsupported: true) + parseTest("(*SKIP:SKIP)", backtrackingDirective(.skip, name: "SKIP"), unsupported: true) + parseTest("(*PRUNE)", backtrackingDirective(.prune), unsupported: true) + parseTest("(*THEN)", backtrackingDirective(.then), unsupported: true) // MARK: Oniguruma absent functions - parseTest("(?~)", absentRepeater(empty()), throwsError: .unsupported) - parseTest("(?~abc)", absentRepeater(concat("a", "b", "c")), throwsError: .unsupported) - parseTest("(?~a+)", absentRepeater(oneOrMore(of: "a")), throwsError: .unsupported) - parseTest("(?~~)", absentRepeater("~"), throwsError: .unsupported) - parseTest("(?~a|b|c)", absentRepeater(alt("a", "b", "c")), throwsError: .unsupported) - parseTest("(?~(a))", absentRepeater(capture("a")), throwsError: .unsupported, captures: []) - parseTest("(?~)*", zeroOrMore(of: 
absentRepeater(empty())), throwsError: .unsupported) - - parseTest("(?~|abc)", absentStopper(concat("a", "b", "c")), throwsError: .unsupported) - parseTest("(?~|a+)", absentStopper(oneOrMore(of: "a")), throwsError: .unsupported) - parseTest("(?~|~)", absentStopper("~"), throwsError: .unsupported) - parseTest("(?~|(a))", absentStopper(capture("a")), throwsError: .unsupported, captures: []) - parseTest("(?~|a){2}", exactly(2, of: absentStopper("a")), throwsError: .unsupported) - - parseTest("(?~|a|b)", absentExpression("a", "b"), throwsError: .unsupported) - parseTest("(?~|~|~)", absentExpression("~", "~"), throwsError: .unsupported) + parseTest("(?~)", absentRepeater(empty()), unsupported: true) + parseTest("(?~abc)", absentRepeater(concat("a", "b", "c")), unsupported: true) + parseTest("(?~a+)", absentRepeater(oneOrMore(of: "a")), unsupported: true) + parseTest("(?~~)", absentRepeater("~"), unsupported: true) + parseTest("(?~a|b|c)", absentRepeater(alt("a", "b", "c")), unsupported: true) + parseTest("(?~(a))", absentRepeater(capture("a")), unsupported: true, captures: []) + parseTest("(?~)*", zeroOrMore(of: absentRepeater(empty())), unsupported: true) + + parseTest("(?~|abc)", absentStopper(concat("a", "b", "c")), unsupported: true) + parseTest("(?~|a+)", absentStopper(oneOrMore(of: "a")), unsupported: true) + parseTest("(?~|~)", absentStopper("~"), unsupported: true) + parseTest("(?~|(a))", absentStopper(capture("a")), unsupported: true, captures: []) + parseTest("(?~|a){2}", exactly(2, of: absentStopper("a")), unsupported: true) + + parseTest("(?~|a|b)", absentExpression("a", "b"), unsupported: true) + parseTest("(?~|~|~)", absentExpression("~", "~"), unsupported: true) parseTest("(?~|(a)|(?:b))", absentExpression(capture("a"), nonCapture("b")), - throwsError: .unsupported, captures: []) + unsupported: true, captures: []) parseTest("(?~|(a)|(?:(b)|c))", absentExpression( capture("a"), nonCapture(alt(capture("b"), "c")) - ), throwsError: .unsupported, captures: 
[.opt]) - parseTest("(?~|a|b)?", zeroOrOne(of: absentExpression("a", "b")), throwsError: .unsupported) + ), unsupported: true, captures: [.opt]) + parseTest("(?~|a|b)?", zeroOrOne(of: absentExpression("a", "b")), unsupported: true) - parseTest("(?~|)", absentRangeClear(), throwsError: .unsupported) + parseTest("(?~|)", absentRangeClear(), unsupported: true) // TODO: It's not really clear what this means, but Oniguruma parses it... // Maybe we should diagnose it? - parseTest("(?~|)+", oneOrMore(of: absentRangeClear()), throwsError: .unsupported) + parseTest("(?~|)+", oneOrMore(of: absentRangeClear()), unsupported: true) // MARK: Global matching options parseTest("(*CR)(*UTF)(*LIMIT_DEPTH=3)", ast( empty(), opts: .newlineMatching(.carriageReturnOnly), .utfMode, - .limitDepth(.init(faking: 3)) - ), throwsError: .unsupported) + .limitDepth(.init(3, at: .fake)) + ), unsupported: true) parseTest( "(*BSR_UNICODE)3", ast("3", opts: .newlineSequenceMatching(.anyUnicode)), - throwsError: .unsupported) + unsupported: true) parseTest( "(*BSR_ANYCRLF)", ast( empty(), opts: .newlineSequenceMatching(.anyCarriageReturnOrLinefeed)), - throwsError: .unsupported) + unsupported: true) // TODO: Diagnose on multiple line matching modes? 
parseTest( @@ -1754,7 +1739,7 @@ extension RegexTests { ast(empty(), opts: [ .carriageReturnOnly, .linefeedOnly, .carriageAndLinefeedOnly, .anyCarriageReturnOrLinefeed, .anyUnicode, .nulCharacter - ].map { .newlineMatching($0) }), throwsError: .unsupported) + ].map { .newlineMatching($0) }), unsupported: true) parseTest( """ @@ -1763,11 +1748,11 @@ extension RegexTests { (*NO_START_OPT)(*UTF)(*UCP)a """, ast("a", opts: - .limitDepth(.init(faking: 3)), .limitHeap(.init(faking: 1)), - .limitMatch(.init(faking: 2)), .notEmpty, .notEmptyAtStart, + .limitDepth(.init(3, at: .fake)), .limitHeap(.init(1, at: .fake)), + .limitMatch(.init(2, at: .fake)), .notEmpty, .notEmptyAtStart, .noAutoPossess, .noDotStarAnchor, .noJIT, .noStartOpt, .utfMode, .unicodeProperties - ), throwsError: .unsupported + ), unsupported: true ) parseTest("[(*CR)]", charClass("(", "*", "C", "R", ")")) @@ -1987,7 +1972,7 @@ extension RegexTests { # h """, ast(empty(), opts: .newlineMatching(.carriageReturnOnly)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -1998,7 +1983,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageReturnOnly)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2009,7 +1994,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.linefeedOnly)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2020,7 +2005,7 @@ extension RegexTests { # h """, ast(empty(), opts: .newlineMatching(.carriageAndLinefeedOnly)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2031,7 +2016,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageAndLinefeedOnly)), - throwsError: .unsupported, 
syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2042,7 +2027,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2053,7 +2038,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2064,7 +2049,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2075,7 +2060,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2096,7 +2081,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2107,7 +2092,7 @@ extension RegexTests { # h """, ast(concat("e", "f"), opts: .newlineMatching(.nulCharacter)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2118,7 +2103,7 @@ extension RegexTests { # h """, ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.nulCharacter)), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: true, syntax: .extendedSyntax ) parseTest( """ @@ -2132,7 +2117,7 @@ extension RegexTests { opts: .newlineMatching(.carriageReturnOnly), .newlineMatching(.nulCharacter) ), - throwsError: .unsupported, syntax: .extendedSyntax + unsupported: 
true, syntax: .extendedSyntax ) parseWithDelimitersTest( @@ -2305,7 +2290,7 @@ extension RegexTests { parseWithDelimitersTest( #"re'(?'a_bcA0-c1A'x*)'"#, balancedCapture(name: "a_bcA0", priorName: "c1A", zeroOrMore(of: "x")), - throwsError: .unsupported) + unsupported: true) parseWithDelimitersTest( #"rx' (?'a_bcA0' a b)'"#, concat(namedCapture("a_bcA0", concat("a", "b")))) @@ -2313,28 +2298,28 @@ extension RegexTests { parseWithDelimitersTest( #"re'(?('a_bcA0')x|y)'"#, conditional( .groupMatched(ref("a_bcA0")), trueBranch: "x", falseBranch: "y"), - throwsError: .unsupported + unsupported: true ) parseWithDelimitersTest( #"re'(?('+20')\')'"#, conditional( .groupMatched(ref(plus: 20)), trueBranch: "'", falseBranch: empty()), - throwsError: .unsupported + unsupported: true ) parseWithDelimitersTest( - #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .invalid) + #"re'a\k'b0A''"#, concat("a", backreference(.named("b0A"))), throwsError: .invalidNamedReference("b0A")) parseWithDelimitersTest( - #"re'\k'+2-1''"#, backreference(.relative(2), recursionLevel: -1), - throwsError: .unsupported + #"re'\k'+2-1''"#, backreference(ref(plus: 2), recursionLevel: -1), + unsupported: true ) parseWithDelimitersTest( - #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))), throwsError: .unsupported) + #"re'a\g'b0A''"#, concat("a", subpattern(.named("b0A"))), unsupported: true) parseWithDelimitersTest( - #"re'\g'-1'\''"#, concat(subpattern(.relative(-1)), "'"), throwsError: .unsupported) + #"re'\g'-1'\''"#, concat(subpattern(ref(minus: 1)), "'"), unsupported: true) parseWithDelimitersTest( - #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(.string(#"a*b\c 🔥_ ;"#)), - throwsError: .unsupported) + #"re'(?C'a*b\c 🔥_ ;')'"#, pcreCallout(string: #"a*b\c 🔥_ ;"#), + unsupported: true) // Fine, because we don't end up skipping. 
delimiterLexingTest(#"re'(?'"#) @@ -2369,6 +2354,8 @@ extension RegexTests { parseNotEqualTest(#"abc"#, #"abd"#) parseNotEqualTest(#" "#, #""#) + parseNotEqualTest(#"a{2}"#, #"a{3}"#) + parseNotEqualTest(#"[\p{Any}]"#, #"[[:Any:]]"#) parseNotEqualTest(#"\u{A}"#, #"\u{B}"#) @@ -2622,6 +2609,157 @@ extension RegexTests { rangeTest("(?~|a|b)", entireRange) } + func testParseRecovery() { + // MARK: Groups + + parseTest( + "(", capture(empty()), + throwsError: .expected(")"), captures: [.cap] + ) + parseTest( + "(abc", capture(concat("a", "b", "c")), + throwsError: .expected(")"), captures: [.cap] + ) + parseTest("(?", nonCapture(empty()), throwsError: .expectedGroupSpecifier, .expected(")")) + parseTest("(?:", nonCapture(empty()), throwsError: .expected(")")) + + parseTest( + "(?<", namedCapture("", empty()), + throwsError: .expectedIdentifier(.groupName), .expected(">"), .expected(")"), + captures: [.named("")] + ) + parseTest( + "(?"), .expected(")"), + captures: [.named("a")] + ) + + // MARK: Character classes + + parseTest("[", charClass(), throwsError: .expectedCustomCharacterClassMembers, .expected("]")) + parseTest("[^", charClass(inverted: true), throwsError: .expectedCustomCharacterClassMembers, .expected("]")) + parseTest("[a", charClass("a"), throwsError: .expected("]")) + + parseTest( + "[a&&", charClass(setOp("a", op: .intersection)), + throwsError: .expectedCustomCharacterClassMembers, .expected("]") + ) + parseTest( + "[a&&b", charClass(setOp("a", op: .intersection, "b")), + throwsError: .expected("]") + ) + + diagnosticTest("[:a", .expected("]")) + diagnosticTest("[:a:", .expected("]")) + diagnosticTest("[[:a", .expected("]")) + diagnosticTest("[[:a:", .expected("]")) + diagnosticTest("[[:a[:]", .expected("]")) + + diagnosticTest("[::]", .emptyProperty) + diagnosticTest("[:=:]", .emptyProperty) + diagnosticTest("[[::]]", .emptyProperty) + diagnosticTest("[[:=:]]", .emptyProperty) + + // MARK: Unicode Scalars + + parseTest(#"\u{"#, scalar("\u{0}"), 
throwsError: .expectedNumber("", kind: .hex), .expected("}")) + parseTest(#"\u{ "#, scalar("\u{0}"), throwsError: .expectedNumber("", kind: .hex), .expected("}")) + parseTest(#"\u{5"#, scalar("\u{5}"), throwsError: .expected("}")) + parseTest(#"\x{5"#, scalar("\u{5}"), throwsError: .expected("}")) + + parseTest(#"\u{ 5"#, scalar("\u{5}"), throwsError: .expected("}")) + parseTest(#"\u{ 5 "#, scalar("\u{5}"), throwsError: .expected("}")) + parseTest(#"\u{ 5 6"#, scalarSeq("\u{5}", "\u{6}"), throwsError: .expected("}")) + parseTest(#"\u{ 5 6 "#, scalarSeq("\u{5}", "\u{6}"), throwsError: .expected("}")) + + parseTest(#"\x{"#, scalar("\u{0}"), throwsError: .expectedNumber("", kind: .hex), .expected("}")) + + parseTest(#"\u{ A H }"#, scalarSeq("\u{A}", "\u{0}"), throwsError: .expectedNumber("H", kind: .hex)) + + parseTest(#"\uABC"#, scalar("\u{ABC}"), throwsError: .expectedNumDigits("ABC", 4)) + + // MARK: Named characters + + parseTest(#"\N{"#, atom(.namedCharacter("")), throwsError: .expected("}")) + parseTest(#"\N{a"#, atom(.namedCharacter("a")), throwsError: .expected("}")) + parseTest(#"\N{U"#, atom(.namedCharacter("U")), throwsError: .expected("}")) + parseTest(#"\N{U+"#, scalar("\u{0}"), throwsError: .expectedNumber("", kind: .hex), .expected("}")) + parseTest(#"\N{U+A"#, scalar("\u{A}"), throwsError: .expected("}")) + parseTest(#"\N{U+}"#, scalar("\u{0}"), throwsError: .expectedNumber("", kind: .hex)) + + // MARK: Character properties + + parseTest( + #"\p{"#, prop(.invalid(key: nil, value: "")), + throwsError: .emptyProperty, .expected("}") + ) + parseTest( + #"\p{a"#, prop(.invalid(key: nil, value: "a")), + throwsError: .unknownProperty(key: nil, value: "a"), .expected("}") + ) + parseTest( + #"\p{a="#, prop(.invalid(key: "a", value: "")), + throwsError: .emptyProperty, .expected("}") + ) + parseTest( + #"\p{a=b"#, prop(.invalid(key: "a", value: "b")), + throwsError: .unknownProperty(key: "a", value: "b"), .expected("}") + ) + parseTest( + #"\p{sc"#, 
prop(.generalCategory(.currencySymbol)), + throwsError: .expected("}") + ) + parseTest( + #"\p{sc="#, prop(.invalid(key: "sc", value: "")), + throwsError: .emptyProperty, .expected("}") + ) + parseTest( + #"\p{sc=a"#, prop(.invalid(key: "sc", value: "a")), + throwsError: .unrecognizedScript("a"), .expected("}") + ) + + // MARK: Matching options + + parseTest( + #"(?^"#, changeMatchingOptions(unsetMatchingOptions(), empty()), + throwsError: .expected(")") + ) + parseTest( + #"(?x"#, changeMatchingOptions(matchingOptions(adding: .extended), empty()), + throwsError: .expected(")") + ) + parseTest( + #"(?xi"#, changeMatchingOptions(matchingOptions(adding: .extended, .caseInsensitive), empty()), + throwsError: .expected(")") + ) + parseTest( + #"(?xi-"#, changeMatchingOptions( + matchingOptions(adding: .extended, .caseInsensitive), empty() + ), + throwsError: .expected(")") + ) + parseTest( + #"(?xi-n"#, changeMatchingOptions( + matchingOptions(adding: .extended, .caseInsensitive, removing: .namedCapturesOnly), + empty() + ), + throwsError: .expected(")") + ) + parseTest( + #"(?xz"#, changeMatchingOptions(matchingOptions(adding: .extended), "z"), + throwsError: .invalidMatchingOption("z"), .expected(")") + ) + parseTest( + #"(?x:"#, changeMatchingOptions(matchingOptions(adding: .extended), empty()), + throwsError: .expected(")") + ) + + // MARK: Invalid values + + parseTest("a{9999999999999999999999999999}", exactly(nil, of: "a"), + throwsError: .numberOverflow("9999999999999999999999999999")) + } + func testParseErrors() { // MARK: Unbalanced delimiters. 
@@ -2630,19 +2768,19 @@ extension RegexTests { diagnosticTest(")))", .unbalancedEndOfGroup) diagnosticTest("())()", .unbalancedEndOfGroup) - diagnosticTest("[", .expectedCustomCharacterClassMembers) - diagnosticTest("[^", .expectedCustomCharacterClassMembers) + diagnosticTest("[", .expectedCustomCharacterClassMembers, .expected("]")) + diagnosticTest("[^", .expectedCustomCharacterClassMembers, .expected("]")) diagnosticTest(#"\u{5"#, .expected("}")) diagnosticTest(#"\x{5"#, .expected("}")) diagnosticTest(#"\N{A"#, .expected("}")) diagnosticTest(#"\N{U+A"#, .expected("}")) - diagnosticTest(#"\p{a"#, .unknownProperty(key: nil, value: "a")) - diagnosticTest(#"\p{a="#, .emptyProperty) + diagnosticTest(#"\p{a"#, .unknownProperty(key: nil, value: "a"), .expected("}")) + diagnosticTest(#"\p{a="#, .emptyProperty, .expected("}")) diagnosticTest(#"\p{a=}"#, .emptyProperty) - diagnosticTest(#"\p{a=b"#, .unknownProperty(key: "a", value: "b")) - diagnosticTest(#"\p{aaa[b]}"#, .unknownProperty(key: nil, value: "aaa")) - diagnosticTest(#"\p{a=b=c}"#, .unknownProperty(key: "a", value: "b")) + diagnosticTest(#"\p{a=b"#, .unknownProperty(key: "a", value: "b"), .expected("}")) + diagnosticTest(#"\p{aaa[b]}"#, .unknownProperty(key: nil, value: "aaa"), .expected("}")) + diagnosticTest(#"\p{a=b=c}"#, .unknownProperty(key: "a", value: "b"), .expected("}")) diagnosticTest(#"\p{script=Not_A_Script}"#, .unrecognizedScript("Not_A_Script")) diagnosticTest(#"\p{scx=Not_A_Script}"#, .unrecognizedScript("Not_A_Script")) diagnosticTest(#"\p{gc=Not_A_Category}"#, .unrecognizedCategory("Not_A_Category")) @@ -2660,30 +2798,30 @@ extension RegexTests { diagnosticTest(#"(?#"#, .expected(")")) diagnosticTest(#"(?x"#, .expected(")")) - diagnosticTest(#"(?"#, .expectedGroupSpecifier) + diagnosticTest(#"(?"#, .expectedGroupSpecifier, .expected(")")) diagnosticTest(#"(?^"#, .expected(")")) diagnosticTest(#"(?^i"#, .expected(")")) - diagnosticTest(#"(?y)"#, .expected("{")) - diagnosticTest(#"(?y{)"#, 
.expected("g")) - diagnosticTest(#"(?y{g)"#, .expected("}")) - diagnosticTest(#"(?y{x})"#, .expected("g")) + diagnosticTest(#"(?y)"#, .expected("{"), unsupported: true) + diagnosticTest(#"(?y{)"#, .unknownTextSegmentMatchingOption(")"), .expected("}"), .expected(")"), unsupported: true) + diagnosticTest(#"(?y{g)"#, .expected("}"), unsupported: true) + diagnosticTest(#"(?y{x})"#, .unknownTextSegmentMatchingOption("x"), unsupported: true) diagnosticTest(#"(?P"#, .expected(")")) - diagnosticTest(#"(?R"#, .expected(")")) + diagnosticTest(#"(?R"#, .expected(")"), unsupported: true) diagnosticTest(#""ab"#, .expected("\""), syntax: .experimental) diagnosticTest(#""ab\""#, .expected("\""), syntax: .experimental) - diagnosticTest("\"ab\\", .expectedEscape, syntax: .experimental) + diagnosticTest("\"ab\\", .expectedEscape, .expected("\""), syntax: .experimental) - diagnosticTest("(?C", .expected(")")) + diagnosticTest("(?C", .expected(")"), unsupported: true) - diagnosticTest("(?<", .expectedIdentifier(.groupName)) - diagnosticTest("(?")) - diagnosticTest("(?")) - diagnosticTest("(?", .expected(")")) + diagnosticTest("(?<", .expectedIdentifier(.groupName), .expected(">"), .expected(")")) + diagnosticTest("(?"), .expected(")")) + diagnosticTest("(?"), .expected(")"), unsupported: true) + diagnosticTest("(?"), .expected(")"), unsupported: true) + diagnosticTest("(?"), .expected(")"), unsupported: true) + diagnosticTest("(?", .expected(")"), unsupported: true) // MARK: Character classes @@ -2724,17 +2862,17 @@ extension RegexTests { diagnosticTest("[a-[b]]", .unsupportedDotNetSubtraction) diagnosticTest(#"[abc-[def]]"#, .unsupportedDotNetSubtraction) diagnosticTest(#"[abc-[^def]]"#, .unsupportedDotNetSubtraction) - diagnosticTest(#"[\d\u{0}[a]-[b-[c]]]"#, .unsupportedDotNetSubtraction) + diagnosticTest(#"[\d\u{0}[a]-[b-[c]]]"#, .unsupportedDotNetSubtraction, .invalidCharacterClassRangeOperand) diagnosticTest("[a-z-[d-w-[m-o]]]", .unsupportedDotNetSubtraction) 
diagnosticTest(#"[a-[:b]]"#, .unsupportedDotNetSubtraction) diagnosticTest(#"[[a]-[b]]"#, .invalidCharacterClassRangeOperand) diagnosticTest(#"[ -[ ]]"#, .unsupportedDotNetSubtraction) diagnosticTest(#"(?x)[a - [b] ]"#, .unsupportedDotNetSubtraction) - diagnosticTest(#"[a-[]]"#, .expectedCustomCharacterClassMembers) + diagnosticTest(#"[a-[]]"#, .expectedCustomCharacterClassMembers, .unsupportedDotNetSubtraction) diagnosticTest(#"[-[]]"#, .expectedCustomCharacterClassMembers) diagnosticTest(#"(?x)[ - [ ] ]"#, .expectedCustomCharacterClassMembers) - diagnosticTest(#"(?x)[a-[ ] ]"#, .expectedCustomCharacterClassMembers) + diagnosticTest(#"(?x)[a-[ ] ]"#, .expectedCustomCharacterClassMembers, .unsupportedDotNetSubtraction) diagnosticTest(#"[a-[:digit:]]"#, .invalidCharacterClassRangeOperand) diagnosticTest("[--]", .expectedCustomCharacterClassMembers) @@ -2753,8 +2891,8 @@ extension RegexTests { diagnosticTest("(?x)[(?#)]", .expected("]")) diagnosticTest("(?x)[(?#abc)]", .expected("]")) - diagnosticTest("(?x)[#]", .expectedCustomCharacterClassMembers) - diagnosticTest("(?x)[ # abc]", .expectedCustomCharacterClassMembers) + diagnosticTest("(?x)[#]", .expectedCustomCharacterClassMembers, .expected("]")) + diagnosticTest("(?x)[ # abc]", .expectedCustomCharacterClassMembers, .expected("]")) // MARK: Bad escapes @@ -2803,7 +2941,7 @@ extension RegexTests { // MARK: Confusable characters diagnosticTest("[\u{301}]", .confusableCharacter("[\u{301}")) - diagnosticTest("(\u{358})", .confusableCharacter("(\u{358}")) + diagnosticTest("(\u{358})", .confusableCharacter("(\u{358}"), .unbalancedEndOfGroup) diagnosticTest("{\u{35B}}", .confusableCharacter("{\u{35B}")) diagnosticTest(#"\\#u{35C}"#, .confusableCharacter(#"\\#u{35C}"#)) diagnosticTest("^\u{35D}", .confusableCharacter("^\u{35D}")) @@ -2819,7 +2957,7 @@ extension RegexTests { diagnosticTest("<{)}>", .unsupported("interpolation")) diagnosticTest("<{}}>", .unsupported("interpolation")) diagnosticTest("<{<{}>", 
.unsupported("interpolation")) - diagnosticTest("(<{)}>", .unsupported("interpolation")) + diagnosticTest("(<{)}>", .expected(")"), .unsupported("interpolation")) // MARK: Character properties @@ -2828,7 +2966,7 @@ extension RegexTests { diagnosticTest(#"\p{x=y}"#, .unknownProperty(key: "x", value: "y")) diagnosticTest(#"\p{aaa(b)}"#, .unknownProperty(key: nil, value: "aaa(b)")) diagnosticTest("[[:a():]]", .unknownProperty(key: nil, value: "a()")) - diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: "aaa")) + diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: "aaa"), .expected("}"), .unknownProperty(key: nil, value: "b")) diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{")) diagnosticTest(#"\p{Basic_Latin}"#, .unknownProperty(key: nil, value: "Basic_Latin")) @@ -2840,16 +2978,15 @@ extension RegexTests { // MARK: Matching options - diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) - diagnosticTest("(?-y{w})", .cannotRemoveTextSegmentOptions) + diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions, unsupported: true) + diagnosticTest("(?-y{w})", .cannotRemoveTextSegmentOptions, unsupported: true) - // FIXME: Reenable once we figure out (?X) and (?u) semantics - //diagnosticTest("(?-X)", .cannotRemoveSemanticsOptions) - //diagnosticTest("(?-u)", .cannotRemoveSemanticsOptions) - diagnosticTest("(?-b)", .cannotRemoveSemanticsOptions) + // FIXME: We need to figure out (?X) and (?u) semantics + diagnosticTest("(?-X)", .cannotRemoveSemanticsOptions, unsupported: true) + diagnosticTest("(?-u)", .cannotRemoveSemanticsOptions, unsupported: true) + diagnosticTest("(?-b)", .cannotRemoveSemanticsOptions, unsupported: true) diagnosticTest("(?a)", .unknownGroupKind("?a")) - diagnosticTest("(?y{)", .expected("g")) // Extended syntax may not be removed in multi-line mode. 
diagnosticWithDelimitersTest(""" @@ -2943,7 +3080,7 @@ extension RegexTests { // MARK: Group specifiers - diagnosticTest(#"(*"#, .unknownGroupKind("*")) + diagnosticTest(#"(*"#, .expectedIdentifier(.onigurumaCalloutName), .expected(")"), unsupported: true) diagnosticTest(#"(?k)"#, .unknownGroupKind("?k")) diagnosticTest(#"(?P#)"#, .invalidMatchingOption("#")) @@ -2957,9 +3094,9 @@ extension RegexTests { diagnosticTest("(?'🔥')", .identifierMustBeAlphaNumeric(.groupName)) - diagnosticTest(#"(?'-')"#, .expectedIdentifier(.groupName)) - diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName)) - diagnosticTest(#"(?'a-b-c')"#, .expected("'")) + diagnosticTest(#"(?'-')"#, .expectedIdentifier(.groupName), unsupported: true) + diagnosticTest(#"(?'--')"#, .identifierMustBeAlphaNumeric(.groupName), unsupported: true) + diagnosticTest(#"(?'a-b-c')"#, .expected("'"), unsupported: true) diagnosticTest("(?x)(? : )", .unknownGroupKind("? ")) @@ -2993,7 +3130,7 @@ extension RegexTests { diagnosticTest(#"$?"#, .notQuantifiable) diagnosticTest(#"(?=a)+"#, .notQuantifiable) diagnosticTest(#"(?i)*"#, .notQuantifiable) - diagnosticTest(#"\K{1}"#, .unsupported(#"'\K'"#)) + diagnosticTest(#"\K{1}"#, .unsupported(#"'\K'"#), .notQuantifiable) diagnosticTest(#"\y{2,5}"#, .notQuantifiable) diagnosticTest(#"\Y{3,}"#, .notQuantifiable) @@ -3001,8 +3138,8 @@ extension RegexTests { diagnosticTest(#"\u{G}"#, .expectedNumber("G", kind: .hex)) - diagnosticTest(#"\u{"#, .expectedNumber("", kind: .hex)) - diagnosticTest(#"\u{ "#, .expectedNumber("", kind: .hex)) + diagnosticTest(#"\u{"#, .expectedNumber("", kind: .hex), .expected("}")) + diagnosticTest(#"\u{ "#, .expectedNumber("", kind: .hex), .expected("}")) diagnosticTest(#"\u{}"#, .expectedNumber("", kind: .hex)) diagnosticTest(#"\u{ }"#, .expectedNumber("", kind: .hex)) diagnosticTest(#"\u{ }"#, .expectedNumber("", kind: .hex)) @@ -3010,7 +3147,7 @@ extension RegexTests { diagnosticTest(#"\u{G }"#, .expectedNumber("G", kind: .hex)) 
diagnosticTest(#"\u{ G }"#, .expectedNumber("G", kind: .hex)) diagnosticTest(#"\u{ GH }"#, .expectedNumber("GH", kind: .hex)) - diagnosticTest(#"\u{ G H }"#, .expectedNumber("G", kind: .hex)) + diagnosticTest(#"\u{ G H }"#, .expectedNumber("G", kind: .hex), .expectedNumber("H", kind: .hex)) diagnosticTest(#"\u{ ABC G }"#, .expectedNumber("G", kind: .hex)) diagnosticTest(#"\u{ FFFFFFFFF A }"#, .numberOverflow("FFFFFFFFF")) @@ -3024,38 +3161,38 @@ extension RegexTests { // MARK: Matching options - diagnosticTest(#"(?^-"#, .cannotRemoveMatchingOptionsAfterCaret) + diagnosticTest(#"(?^-"#, .cannotRemoveMatchingOptionsAfterCaret, .expected(")")) diagnosticTest(#"(?^-)"#, .cannotRemoveMatchingOptionsAfterCaret) - diagnosticTest(#"(?^i-"#, .cannotRemoveMatchingOptionsAfterCaret) + diagnosticTest(#"(?^i-"#, .cannotRemoveMatchingOptionsAfterCaret, .expected(")")) diagnosticTest(#"(?^i-m)"#, .cannotRemoveMatchingOptionsAfterCaret) diagnosticTest(#"(?i)?"#, .notQuantifiable) // MARK: References diagnosticTest(#"\k''"#, .expectedIdentifier(.groupName)) - diagnosticTest(#"(?&)"#, .expectedIdentifier(.groupName)) - diagnosticTest(#"(?P>)"#, .expectedIdentifier(.groupName)) + diagnosticTest(#"(?&)"#, .expectedIdentifier(.groupName), unsupported: true) + diagnosticTest(#"(?P>)"#, .expectedIdentifier(.groupName), unsupported: true) diagnosticTest(#"\g{0}"#, .cannotReferToWholePattern) - diagnosticTest(#"(?(0))"#, .cannotReferToWholePattern) + diagnosticTest(#"(?(0))"#, .cannotReferToWholePattern, unsupported: true) - diagnosticTest(#"(?&&)"#, .identifierMustBeAlphaNumeric(.groupName)) - diagnosticTest(#"(?&-1)"#, .identifierMustBeAlphaNumeric(.groupName)) - diagnosticTest(#"(?P>+1)"#, .identifierMustBeAlphaNumeric(.groupName)) - diagnosticTest(#"(?P=+1)"#, .identifierMustBeAlphaNumeric(.groupName)) - diagnosticTest(#"\k'#'"#, .identifierMustBeAlphaNumeric(.groupName)) - diagnosticTest(#"(?&#)"#, .identifierMustBeAlphaNumeric(.groupName)) + diagnosticTest(#"(?&&)"#, 
.identifierMustBeAlphaNumeric(.groupName), unsupported: true) + diagnosticTest(#"(?&-1)"#, .identifierMustBeAlphaNumeric(.groupName), unsupported: true) + diagnosticTest(#"(?P>+1)"#, .identifierMustBeAlphaNumeric(.groupName), unsupported: true) + diagnosticTest(#"(?P=+1)"#, .identifierMustBeAlphaNumeric(.groupName), unsupported: true) + diagnosticTest(#"\k'#'"#, .identifierMustBeAlphaNumeric(.groupName), unsupported: true) + diagnosticTest(#"(?&#)"#, .identifierMustBeAlphaNumeric(.groupName), unsupported: true) - diagnosticTest(#"(?P>1)"#, .identifierCannotStartWithNumber(.groupName)) - diagnosticTest(#"\k{1}"#, .identifierCannotStartWithNumber(.groupName)) + diagnosticTest(#"(?P>1)"#, .identifierCannotStartWithNumber(.groupName), unsupported: true) + diagnosticTest(#"\k{1}"#, .identifierCannotStartWithNumber(.groupName), .invalidNamedReference("1")) - diagnosticTest(#"\g<1-1>"#, .expected(">")) - diagnosticTest(#"\g{1-1}"#, .expected("}")) - diagnosticTest(#"\k{a-1}"#, .expected("}")) - diagnosticTest(#"\k{a-}"#, .expected("}")) + diagnosticTest(#"\g<1-1>"#, .expected(">"), unsupported: true) + diagnosticTest(#"\g{1-1}"#, .expected("}"), .invalidReference(1)) + diagnosticTest(#"\k{a-1}"#, .expected("}"), .invalidNamedReference("a")) + diagnosticTest(#"\k{a-}"#, .expected("}"), .invalidNamedReference("a")) - diagnosticTest(#"\k"#, .expectedNumber("", kind: .decimal)) - diagnosticTest(#"\k<1+>"#, .expectedNumber("", kind: .decimal)) + diagnosticTest(#"\k"#, .expectedNumber("", kind: .decimal), .invalidNamedReference("a")) + diagnosticTest(#"\k<1+>"#, .expectedNumber("", kind: .decimal), .invalidReference(1)) diagnosticTest(#"()\k<1+1>"#, .unsupported("recursion level")) diagnosticTest(#"()\k<1-1>"#, .unsupported("recursion level")) @@ -3075,65 +3212,67 @@ extension RegexTests { // MARK: Conditionals - diagnosticTest(#"(?(1)a|b|c)"#, .tooManyBranchesInConditional(3)) - diagnosticTest(#"(?(1)||)"#, .tooManyBranchesInConditional(3)) + diagnosticTest(#"(?(1)a|b|c)"#, 
.tooManyBranchesInConditional(3), unsupported: true) + diagnosticTest(#"(?(1)||)"#, .tooManyBranchesInConditional(3), unsupported: true) diagnosticTest(#"(?(?i))"#, .unknownGroupKind("?(")) // MARK: Callouts // PCRE callouts - diagnosticTest("(?C-1)", .unknownCalloutKind("(?C-1)")) - diagnosticTest("(?C-1", .unknownCalloutKind("(?C-1)")) + diagnosticTest("(?C-1)", .unknownCalloutKind("(?C-1)"), unsupported: true) + diagnosticTest("(?C-1", .unknownCalloutKind("(?C-1)"), .expected(")"), unsupported: true) // Oniguruma named callouts - diagnosticTest("(*bar[", .expectedIdentifier(.onigurumaCalloutTag)) - diagnosticTest("(*bar[%", .identifierMustBeAlphaNumeric(.onigurumaCalloutTag)) - diagnosticTest("(*bar{", .expectedCalloutArgument) - diagnosticTest("(*bar}", .expected(")")) - diagnosticTest("(*bar]", .expected(")")) + diagnosticTest("(*bar[", .expectedIdentifier(.onigurumaCalloutTag), .expected("]"), .expected(")"), unsupported: true) + diagnosticTest("(*bar[%", .identifierMustBeAlphaNumeric(.onigurumaCalloutTag), .expected("]"), .expected(")"), unsupported: true) + diagnosticTest("(*bar{", .expectedCalloutArgument, .expected("}"), .expected(")"), unsupported: true) + diagnosticTest("(*bar}", .expected(")"), unsupported: true) + diagnosticTest("(*bar]", .expected(")"), unsupported: true) // Oniguruma 'of contents' callouts - diagnosticTest("(?{", .expected("}")) - diagnosticTest("(?{}", .expectedNonEmptyContents) - diagnosticTest("(?{x}", .expected(")")) - diagnosticTest("(?{x}}", .expected(")")) - diagnosticTest("(?{{x}}", .expected(")")) - diagnosticTest("(?{{x}", .expected("}")) - diagnosticTest("(?{x}[", .expectedIdentifier(.onigurumaCalloutTag)) - diagnosticTest("(?{x}[%", .identifierMustBeAlphaNumeric(.onigurumaCalloutTag)) - diagnosticTest("(?{x}[a]", .expected(")")) - diagnosticTest("(?{x}[a]K", .expected(")")) - diagnosticTest("(?{x}[a]X", .expected(")")) - diagnosticTest("(?{{x}y}", .expected("}")) + diagnosticTest("(?{", .expected("}"), 
.expectedNonEmptyContents, .expected(")"), unsupported: true) + diagnosticTest("(?{}", .expectedNonEmptyContents, .expected(")"), unsupported: true) + diagnosticTest("(?{x}", .expected(")"), unsupported: true) + diagnosticTest("(?{x}}", .expected(")"), unsupported: true) + diagnosticTest("(?{{x}}", .expected(")"), unsupported: true) + + // TODO: We shouldn't be emitting both 'expected }' and 'expected }}' here. + diagnosticTest("(?{{x}", .expected("}"), .expected("}}"), .expected(")"), unsupported: true) + diagnosticTest("(?{x}[", .expectedIdentifier(.onigurumaCalloutTag), .expected("]"), .expected(")"), unsupported: true) + diagnosticTest("(?{x}[%", .identifierMustBeAlphaNumeric(.onigurumaCalloutTag), .expected("]"), .expected(")"), unsupported: true) + diagnosticTest("(?{x}[a]", .expected(")"), unsupported: true) + diagnosticTest("(?{x}[a]K", .expected(")"), unsupported: true) + diagnosticTest("(?{x}[a]X", .expected(")"), unsupported: true) + diagnosticTest("(?{{x}y}", .expected("}"), .expected("}}"), .expected(")"), unsupported: true) // MARK: Backtracking directives - diagnosticTest("(*MARK)", .backtrackingDirectiveMustHaveName("MARK")) - diagnosticTest("(*:)", .expectedNonEmptyContents) - diagnosticTest("(*MARK:a)?", .unsupported("backtracking directive")) - diagnosticTest("(*FAIL)+", .unsupported("backtracking directive")) - diagnosticTest("(*COMMIT:b)*", .unsupported("backtracking directive")) - diagnosticTest("(*PRUNE:a)??", .unsupported("backtracking directive")) - diagnosticTest("(*SKIP:a)*?", .unsupported("backtracking directive")) - diagnosticTest("(*F)+?", .unsupported("backtracking directive")) - diagnosticTest("(*:a){2}", .unsupported("backtracking directive")) + diagnosticTest("(*MARK)", .backtrackingDirectiveMustHaveName("MARK"), unsupported: true) + diagnosticTest("(*:)", .expectedNonEmptyContents, unsupported: true) + diagnosticTest("(*MARK:a)?", .notQuantifiable, unsupported: true) + diagnosticTest("(*FAIL)+", .notQuantifiable, unsupported: 
true) + diagnosticTest("(*COMMIT:b)*", .notQuantifiable, unsupported: true) + diagnosticTest("(*PRUNE:a)??", .notQuantifiable, unsupported: true) + diagnosticTest("(*SKIP:a)*?", .notQuantifiable, unsupported: true) + diagnosticTest("(*F)+?", .notQuantifiable, unsupported: true) + diagnosticTest("(*:a){2}", .notQuantifiable, unsupported: true) // MARK: Oniguruma absent functions - diagnosticTest("(?~", .expected(")")) - diagnosticTest("(?~|", .expected(")")) - diagnosticTest("(?~|a|b|c)", .tooManyAbsentExpressionChildren(3)) - diagnosticTest("(?~||||)", .tooManyAbsentExpressionChildren(4)) + diagnosticTest("(?~", .expected(")"), unsupported: true) + diagnosticTest("(?~|", .expected(")"), unsupported: true) + diagnosticTest("(?~|a|b|c)", .tooManyAbsentExpressionChildren(3), unsupported: true) + diagnosticTest("(?~||||)", .tooManyAbsentExpressionChildren(4), unsupported: true) // MARK: Global matching options diagnosticTest("a(*CR)", .globalMatchingOptionNotAtStart("(*CR)")) - diagnosticTest("(*CR)a(*LF)", .globalMatchingOptionNotAtStart("(*LF)")) - diagnosticTest("(*LIMIT_HEAP)", .expected("=")) - diagnosticTest("(*LIMIT_DEPTH=", .expectedNumber("", kind: .decimal)) + diagnosticTest("(*CR)a(*LF)", .globalMatchingOptionNotAtStart("(*LF)"), unsupported: true) + diagnosticTest("(*LIMIT_HEAP)", .expected("="), .expectedNumber("", kind: .decimal), unsupported: true) + diagnosticTest("(*LIMIT_DEPTH=", .expectedNumber("", kind: .decimal), .expected(")"), unsupported: true) // TODO: This diagnostic could be better. 
- diagnosticTest("(*LIMIT_DEPTH=-1", .expectedNumber("", kind: .decimal)) + diagnosticTest("(*LIMIT_DEPTH=-1", .expectedNumber("", kind: .decimal), .expected(")"), unsupported: true) } func testDelimiterLexingErrors() { @@ -3179,4 +3318,41 @@ extension RegexTests { compilerInterfaceDiagnosticMessageTest( #"#/\u{}/#"#, "cannot parse regular expression: expected hexadecimal number") } + + func testParserFatalError() { + do { + var p = Parser(Source(""), syntax: .traditional) + p.advance() + try p.parse().ensureValid() + XCTFail("Expected unreachable") + } catch let err { + if !"\(err)".hasPrefix("UNREACHABLE") { + XCTFail("Expected unreachable \(err)") + } + } + + // Make sure fatal errors are preserved through lookaheads and backtracks. + do { + var p = Parser(Source(""), syntax: .traditional) + p.lookahead { p in + p.tryEating { p -> Void? in + p.lookahead { p in + p.advance() + p.lookahead { _ in } + p.tryEating { _ in } + } + return nil + } + } + if p.diags.diags.count != 1 { + XCTFail("Expected single fatal diagnostic") + } + try p.diags.throwAnyError() + XCTFail("Expected unreachable") + } catch let err { + if !"\(err)".hasPrefix("UNREACHABLE") { + XCTFail("Expected unreachable \(err)") + } + } + } } diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 7bf8ba412..97ba3e333 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -21,7 +21,7 @@ func testConversion( _ expectedDSL: String, file: StaticString = #file, line: UInt = #line ) throws { - let ast = try _RegexParser.parse(regex, .semantic, .traditional) + let ast = try _RegexParser.parse(regex, .traditional) let actualDSL = renderAsBuilderDSL(ast: ast)._trimmingSuffix(while: \.isWhitespace) XCTAssertEqual(actualDSL, expectedDSL[...], file: file, line: line) }