diff --git a/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md b/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md index f3dda59ba..7dd56b6a8 100644 --- a/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md +++ b/Documentation/Evolution/RegexSyntaxRunTimeConstruction.md @@ -339,7 +339,7 @@ UnicodeScalar -> '\u{' HexDigit{1...} '}' | '\o{' OctalDigit{1...} '}' | '\0' OctalDigit{0...3} -HexDigit -> [0-9a-zA-Z] +HexDigit -> [0-9a-fA-F] OctalDigit -> [0-7] NamedScalar -> '\N{' ScalarName '}' diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 0aa0951c5..1f6043d72 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -72,6 +72,9 @@ extension AST { // (*ACCEPT), (*FAIL), ... case backtrackingDirective(BacktrackingDirective) + + // (?i), (?i-m), ... + case changeMatchingOptions(MatchingOptionSequence) } } } @@ -91,6 +94,7 @@ extension AST.Atom { case .subpattern(let v): return v case .callout(let v): return v case .backtrackingDirective(let v): return v + case .changeMatchingOptions(let v): return v case .any: return nil case .startOfLine: return nil case .endOfLine: return nil @@ -397,9 +401,6 @@ extension AST.Atom.CharacterProperty { /// Some special properties implemented by PCRE and Oniguruma. case pcreSpecial(PCRESpecialCategory) case onigurumaSpecial(OnigurumaSpecialProperty) - - /// Unhandled properties. - case other(key: String?, value: String) } // TODO: erm, separate out or fold into something? splat it in? @@ -631,6 +632,41 @@ extension AST.Atom { } } +extension AST.Atom.EscapedBuiltin { + /// If the escape sequence represents a unicode scalar value, returns the + /// value, otherwise `nil`. + public var scalarValue: UnicodeScalar? { + switch self { + // TODO: Should we separate these into a separate enum? Or move the + // specifics of the scalar to the DSL tree? + case .alarm: + return "\u{7}" + case .backspace: + return "\u{8}" + case .escape: + return "\u{1B}" + case .formfeed: + return "\u{C}" + case .newline: + return "\n" + case .carriageReturn: + return "\r" + case .tab: + return "\t" + + case .singleDataUnit, .decimalDigit, .notDecimalDigit, + .horizontalWhitespace, .notHorizontalWhitespace, .notNewline, + .newlineSequence, .whitespace, .notWhitespace, .verticalTab, + .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster, + .wordBoundary, .notWordBoundary, .startOfSubject, + .endOfSubjectBeforeNewline, .endOfSubject, + .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar, + .textSegment, .notTextSegment: + return nil + } + } +} + extension AST.Atom { /// Retrieve the character value of the atom if it represents a literal /// character or unicode scalar, nil otherwise. @@ -642,34 +678,7 @@ extension AST.Atom { return Character(s) case .escaped(let c): - switch c { - // TODO: Should we separate these into a separate enum? Or move the - // specifics of the scalar to the DSL tree? 
- case .alarm: - return "\u{7}" - case .backspace: - return "\u{8}" - case .escape: - return "\u{1B}" - case .formfeed: - return "\u{C}" - case .newline: - return "\n" - case .carriageReturn: - return "\r" - case .tab: - return "\t" - - case .singleDataUnit, .decimalDigit, .notDecimalDigit, - .horizontalWhitespace, .notHorizontalWhitespace, .notNewline, - .newlineSequence, .whitespace, .notWhitespace, .verticalTab, - .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster, - .wordBoundary, .notWordBoundary, .startOfSubject, - .endOfSubjectBeforeNewline, .endOfSubject, - .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar, - .textSegment, .notTextSegment: - return nil - } + return c.scalarValue.map(Character.init) case .keyboardControl, .keyboardMeta, .keyboardMetaControl: // TODO: These should have unicode scalar values. @@ -683,7 +692,7 @@ extension AST.Atom { return nil case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern, - .callout, .backtrackingDirective: + .callout, .backtrackingDirective, .changeMatchingOptions: return nil } } @@ -723,7 +732,7 @@ extension AST.Atom { case .property, .escaped, .any, .startOfLine, .endOfLine, .backreference, .subpattern, .namedCharacter, .callout, - .backtrackingDirective: + .backtrackingDirective, .changeMatchingOptions: return nil } } @@ -732,6 +741,8 @@ extension AST.Atom { switch kind { case .backtrackingDirective(let b): return b.isQuantifiable + case .changeMatchingOptions: + return false // TODO: Are callouts quantifiable? default: return true diff --git a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift index 614048f0a..19e72aef5 100644 --- a/Sources/_RegexParser/Regex/AST/CustomCharClass.swift +++ b/Sources/_RegexParser/Regex/AST/CustomCharClass.swift @@ -97,6 +97,10 @@ extension CustomCC.Member { if case .trivia = self { return true } return false } + + public var isSemantic: Bool { + !isTrivia + } } extension AST.CustomCharacterClass { @@ -104,7 +108,7 @@ extension AST.CustomCharacterClass { /// nested custom character classes. public var strippingTriviaShallow: Self { var copy = self - copy.members = copy.members.filter { !$0.isTrivia } + copy.members = copy.members.filter(\.isSemantic) return copy } } diff --git a/Sources/_RegexParser/Regex/AST/Group.swift b/Sources/_RegexParser/Regex/AST/Group.swift index 81e0931ad..a8c4f8b0f 100644 --- a/Sources/_RegexParser/Regex/AST/Group.swift +++ b/Sources/_RegexParser/Regex/AST/Group.swift @@ -68,9 +68,7 @@ extension AST { case atomicScriptRun // (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:) - // Isolated options are written as e.g (?i), and implicitly form a group - // containing all the following elements of the current group. - case changeMatchingOptions(MatchingOptionSequence, isIsolated: Bool) + case changeMatchingOptions(MatchingOptionSequence) // NOTE: Comments appear to be groups, but are not parsed // the same. They parse more like quotes, so are not @@ -87,21 +85,6 @@ extension AST.Group.Kind { } } - /// Whether this is a group with an implicit scope, e.g isolated matching - /// options implicitly become parent groups for the rest of the elements in - /// the current group: - /// - /// (a(?i)bc)de -> (a(?i:bc))de - /// - public var hasImplicitScope: Bool { - switch self { - case .changeMatchingOptions(_, let isIsolated): - return isIsolated - default: - return false - } - } - /// If this is a named group, its name, `nil` otherwise. public var name: String? 
{ switch self { diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index e5b65a46c..911312121 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -397,8 +397,9 @@ extension Source { return .pcreSpecial(pcreSpecial) } - // Otherwise we don't know what this is. - return .other(key: nil, value: value) + // TODO: This should be versioned, and do we want a more lax behavior for + // the runtime? + throw ParseError.unknownProperty(key: nil, value: value) } static func classifyCharacterProperty( @@ -435,6 +436,8 @@ extension Source { if let match = match { return match } - return .other(key: key, value: value) + // TODO: This should be versioned, and do we want a more lax behavior for + // the runtime? + throw ParseError.unknownProperty(key: key, value: value) } } diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index 621d6ea11..c3d74c30b 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -57,8 +57,8 @@ enum ParseError: Error, Hashable { case expectedCustomCharacterClassMembers case invalidCharacterClassRangeOperand - case invalidPOSIXSetName(String) case emptyProperty + case unknownProperty(key: String?, value: String) case expectedGroupSpecifier case unbalancedEndOfGroup @@ -142,10 +142,13 @@ extension ParseError: CustomStringConvertible { return "expected custom character class members" case .invalidCharacterClassRangeOperand: return "invalid character class range" - case let .invalidPOSIXSetName(n): - return "invalid character set name: '\(n)'" case .emptyProperty: return "empty property" + case .unknownProperty(let key, let value): + if let key = key { + return "unknown character property '\(key)=\(value)'" + } + return "unknown character property '\(value)'" case .expectedGroupSpecifier: return "expected group specifier" case .unbalancedEndOfGroup: diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index c48d53de9..6a61ccdf7 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -21,6 +21,16 @@ API convention: - eat() and tryEat() is still used by the parser as a character-by-character interface */ +extension Error { + func addingLocation(_ loc: Range) -> Error { + // If we're already a LocatedError, don't change the location. + if self is _LocatedErrorProtocol { + return self + } + return Source.LocatedError(self, loc) + } +} + extension Source { // MARK: - recordLoc @@ -51,12 +61,8 @@ extension Source { do { guard let result = try f(&self) else { return nil } return Located(result, start.. { - throw e - } catch let e as ParseError { - throw LocatedError(e, start.. AST.MatchingOptionSequence? { + try tryEating { src in + guard src.tryEat(sequence: "(?"), + let seq = try src.lexMatchingOptionSequence(context: context) + else { return nil } + try src.expect(")") + return seq + } + } + /// Try to consume explicitly spelled-out PCRE2 group syntax. mutating func lexExplicitPCRE2GroupStart() -> AST.Group.Kind? { tryEating { src in @@ -846,7 +868,7 @@ extension Source { // otherwise a matching option specifier. 
Conversely, '(?P' can be the // start of a matching option sequence, or a reference if it is followed // by '=' or '<'. - guard !src.shouldLexGroupLikeAtom() else { return nil } + guard !src.shouldLexGroupLikeAtom(context: context) else { return nil } guard src.tryEat("(") else { return nil } if src.tryEat("?") { @@ -871,22 +893,13 @@ extension Source { // Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:). if let seq = try src.lexMatchingOptionSequence(context: context) { - if src.tryEat(":") { - return .changeMatchingOptions(seq, isIsolated: false) - } - // If this isn't start of an explicit group, we should have an - // implicit group that covers the remaining elements of the current - // group. - // TODO: This implicit scoping behavior matches Oniguruma, but PCRE - // also does it across alternations, which will require additional - // handling. - guard src.tryEat(")") else { + guard src.tryEat(":") else { if let next = src.peek() { throw ParseError.invalidMatchingOption(next) } throw ParseError.expected(")") } - return .changeMatchingOptions(seq, isIsolated: true) + return .changeMatchingOptions(seq) } guard let next = src.peek() else { @@ -1035,18 +1048,8 @@ extension Source { context: ParsingContext ) throws -> Located? { try tryEating { src in - guard src.tryEat(sequence: "(?"), - let group = try src.lexGroupStart(context: context) - else { return nil } - - // Implicitly scoped groups are not supported here. - guard !group.value.hasImplicitScope else { - throw LocatedError( - ParseError.unsupportedCondition("implicitly scoped group"), - group.location - ) - } - return group + guard src.tryEat(sequence: "(?") else { return nil } + return try src.lexGroupStart(context: context) } } @@ -1066,9 +1069,13 @@ extension Source { mutating func lexCustomCCStart( ) throws -> Located? { recordLoc { src in - // POSIX named sets are atoms. - guard !src.starts(with: "[:") else { return nil } - + // Make sure we don't have a POSIX character property. This may require + // walking to its ending to make sure we have a closing ':]', as otherwise + // we have a custom character class. + // TODO: This behavior seems subtle, could we warn? + guard !src.canLexPOSIXCharacterProperty() else { + return nil + } if src.tryEat("[") { return src.tryEat("^") ? .inverted : .normal } @@ -1101,10 +1108,33 @@ extension Source { private mutating func lexPOSIXCharacterProperty( ) throws -> Located? { try recordLoc { src in - guard src.tryEat(sequence: "[:") else { return nil } - let inverted = src.tryEat("^") - let prop = try src.lexCharacterPropertyContents(end: ":]").value - return .init(prop, isInverted: inverted, isPOSIX: true) + try src.tryEating { src in + guard src.tryEat(sequence: "[:") else { return nil } + let inverted = src.tryEat("^") + + // Note we lex the contents and ending *before* classifying, because we + // want to bail with nil if we don't have the right ending. This allows + // the lexing of a custom character class if we don't have a ':]' + // ending. 
+ let (key, value) = src.lexCharacterPropertyKeyValue() + guard src.tryEat(sequence: ":]") else { return nil } + + let prop = try Source.classifyCharacterPropertyContents(key: key, + value: value) + return .init(prop, isInverted: inverted, isPOSIX: true) + } + } + } + + private func canLexPOSIXCharacterProperty() -> Bool { + do { + var src = self + return try src.lexPOSIXCharacterProperty() != nil + } catch { + // We want to tend on the side of lexing a POSIX character property, so + // even if it is invalid in some way (e.g invalid property names), still + // try and lex it. + return true } } @@ -1129,26 +1159,60 @@ extension Source { } } - private mutating func lexCharacterPropertyContents( - end: String - ) throws -> Located { - try recordLoc { src in - // We should either have: - // - 'x=y' where 'x' is a property key, and 'y' is a value. - // - 'y' where 'y' is a value (or a bool key with an inferred value - // of true), and its key is inferred. - // TODO: We could have better recovery here if we only ate the characters - // that property keys and values can use. - let lhs = src.lexUntil { - $0.isEmpty || $0.peek() == "=" || $0.starts(with: end) - }.value - if src.tryEat("=") { - let rhs = try src.lexUntil(eating: end).value - return try Source.classifyCharacterProperty(key: lhs, value: rhs) + private mutating func lexCharacterPropertyKeyValue( + ) -> (key: String?, value: String) { + func atPossibleEnding(_ src: inout Source) -> Bool { + guard let next = src.peek() else { return true } + switch next { + case "=": + // End of a key. + return true + case ":", "[", "]": + // POSIX character property endings to cover ':]', ']', and '[' as the + // start of a nested character class. + return true + case "}": + // Ending of '\p{'. We cover this for POSIX too as it's not a valid + // character property name anyway, and it's nice not to have diverging + // logic for these cases. + return true + case "\\": + // An escape sequence, which may include e.g '\Q :] \E'. ICU bails here + // for all its known escape sequences (e.g '\a', '\e' '\f', ...). It + // seems character class escapes e.g '\d' are excluded, however it's not + // clear that is intentional. Let's apply the rule for any escape, as a + // backslash would never be a valid character property name, and we can + // diagnose any invalid escapes when parsing as a character class. + return true + default: + // We may want to handle other metacharacters here, e.g '{', '(', ')', + // as they're not valid character property names. However for now + // let's tend on the side of forming an unknown property name in case + // these characters are ever used in future character property names + // (though it's very unlikely). Users can always escape e.g the ':' + // in '[:' if they definitely want a custom character class. + return false } - try src.expect(sequence: end) - return try Source.classifyCharacterPropertyValueOnly(lhs) } + // We should either have: + // - 'x=y' where 'x' is a property key, and 'y' is a value. + // - 'y' where 'y' is a value (or a bool key with an inferred value of true) + // and its key is inferred. 
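// Editor's note (illustration only, not part of the patch): for example,
// \p{Script=Greek} spells out an explicit key and value, while \p{Greek} and
// \p{Lu} supply only a value and have their key inferred.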
+ let lhs = lexUntil(atPossibleEnding).value + if tryEat("=") { + let rhs = lexUntil(atPossibleEnding).value + return (lhs, rhs) + } + return (nil, lhs) + } + + private static func classifyCharacterPropertyContents( + key: String?, value: String + ) throws -> AST.Atom.CharacterProperty.Kind { + if let key = key { + return try classifyCharacterProperty(key: key, value: value) + } + return try classifyCharacterPropertyValueOnly(value) } /// Try to consume a character property. @@ -1164,7 +1228,10 @@ extension Source { let isInverted = src.peek() == "P" src.advance(2) - let prop = try src.lexCharacterPropertyContents(end: "}").value + let (key, value) = src.lexCharacterPropertyKeyValue() + let prop = try Source.classifyCharacterPropertyContents(key: key, + value: value) + try src.expect("}") return .init(prop, isInverted: isInverted, isPOSIX: false) } } @@ -1177,17 +1244,19 @@ extension Source { allowWholePatternRef: Bool = false, allowRecursionLevel: Bool = false ) throws -> AST.Reference? { let kind = try recordLoc { src -> AST.Reference.Kind? in - // Note this logic should match canLexNumberedReference. - if src.tryEat("+") { - return .relative(try src.expectNumber().value) - } - if src.tryEat("-") { - return .relative(try -src.expectNumber().value) - } - if let num = try src.lexNumber() { - return .absolute(num.value) + try src.tryEating { src in + // Note this logic should match canLexNumberedReference. + if src.tryEat("+"), let num = try src.lexNumber() { + return .relative(num.value) + } + if src.tryEat("-"), let num = try src.lexNumber() { + return .relative(-num.value) + } + if let num = try src.lexNumber() { + return .absolute(num.value) + } + return nil } - return nil } guard let kind = kind else { return nil } guard allowWholePatternRef || kind.value != .recurseWholePattern else { @@ -1416,8 +1485,21 @@ extension Source { return src.canLexNumberedReference() } + private func canLexMatchingOptionsAsAtom(context: ParsingContext) -> Bool { + var src = self + + // See if we can lex a matching option sequence that terminates in ')'. Such + // a sequence is an atom. If an error is thrown, there are invalid elements + // of the matching option sequence. In such a case, we can lex as a group + // and diagnose the invalid group kind. + guard (try? src.lexMatchingOptionSequence(context: context)) != nil else { + return false + } + return src.tryEat(")") + } + /// Whether a group specifier should be lexed as an atom instead of a group. - private func shouldLexGroupLikeAtom() -> Bool { + private func shouldLexGroupLikeAtom(context: ParsingContext) -> Bool { var src = self guard src.tryEat("(") else { return false } @@ -1431,6 +1513,9 @@ extension Source { // The start of an Oniguruma 'of-contents' callout. if src.tryEat("{") { return true } + // A matching option atom (?x), (?i), ... + if src.canLexMatchingOptionsAsAtom(context: context) { return true } + return false } // The start of a backreference directive or Oniguruma named callout. @@ -1691,13 +1776,20 @@ extension Source { /// /// GroupLikeAtom -> GroupLikeReference | Callout | BacktrackingDirective /// - mutating func expectGroupLikeAtom() throws -> AST.Atom.Kind { + mutating func expectGroupLikeAtom( + context: ParsingContext + ) throws -> AST.Atom.Kind { try recordLoc { src in // References that look like groups, e.g (?R), (?1), ... if let ref = try src.lexGroupLikeReference() { return ref.value } + // Change matching options atom (?i), (?x-i), ... 
+ if let seq = try src.lexChangeMatchingOptionAtom(context: context) { + return .changeMatchingOptions(seq) + } + // (*ACCEPT), (*FAIL), (*MARK), ... if let b = try src.lexBacktrackingDirective() { return .backtrackingDirective(b) @@ -1758,18 +1850,16 @@ extension Source { if !customCC && (src.peek() == ")" || src.peek() == "|") { return nil } // TODO: Store customCC in the atom, if that's useful - // POSIX character property. This is only allowed in a custom character - // class. - // TODO: Can we try and recover and diagnose these outside character - // classes? - if customCC, let prop = try src.lexPOSIXCharacterProperty()?.value { + // POSIX character property. Like \p{...} this is also allowed outside of + // a custom character class. + if let prop = try src.lexPOSIXCharacterProperty()?.value { return .property(prop) } // If we have group syntax that was skipped over in lexGroupStart, we // need to handle it as an atom, or throw an error. - if !customCC && src.shouldLexGroupLikeAtom() { - return try src.expectGroupLikeAtom() + if !customCC && src.shouldLexGroupLikeAtom(context: context) { + return try src.expectGroupLikeAtom(context: context) } // A quantifier here is invalid. @@ -1787,6 +1877,9 @@ extension Source { } throw Unreachable("TODO: reason") + case "(" where !customCC: + throw Unreachable("Should have lexed a group or group-like atom") + // (sometimes) special metacharacters case ".": return customCC ? .char(".") : .any case "^": return customCC ? .char("^") : .startOfLine diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index c3aa3500b..975012546 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -282,42 +282,53 @@ extension Parser { loc(start))) } + /// Apply the syntax options of a given matching option sequence to the + /// current set of options. + private mutating func applySyntaxOptions( + of opts: AST.MatchingOptionSequence + ) { + // We skip this for multi-line, as extended syntax is always enabled there. + if context.syntax.contains(.multilineExtendedSyntax) { return } + + // Check if we're introducing or removing extended syntax. + // TODO: PCRE differentiates between (?x) and (?xx) where only the latter + // handles non-semantic whitespace in a custom character class. Other + // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore, + // treat (?x) and (?xx) as the same option here. If we ever get a strict + // PCRE mode, we will need to change this to handle that. + if opts.resetsCurrentOptions { + context.syntax.remove(.extendedSyntax) + } + if opts.adding.contains(where: \.isAnyExtended) { + context.syntax.insert(.extendedSyntax) + } + if opts.removing.contains(where: \.isAnyExtended) { + context.syntax.remove(.extendedSyntax) + } + } + + /// Apply the syntax options of a matching option changing group to the + /// current set of options. + private mutating func applySyntaxOptions(of group: AST.Group.Kind) { + if case .changeMatchingOptions(let seq) = group { + applySyntaxOptions(of: seq) + } + } + /// Perform a recursive parse for the body of a group. mutating func parseGroupBody( start: Source.Position, _ kind: AST.Located ) throws -> AST.Group { context.recordGroup(kind.value) - // Check if we're introducing or removing extended syntax. We skip this for - // multi-line, as extended syntax is always enabled there. 
- // TODO: PCRE differentiates between (?x) and (?xx) where only the latter - // handles non-semantic whitespace in a custom character class. Other - // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore, - // treat (?x) and (?xx) as the same option here. If we ever get a strict - // PCRE mode, we will need to change this to handle that. let currentSyntax = context.syntax - if !context.syntax.contains(.multilineExtendedSyntax) { - if case .changeMatchingOptions(let c, isIsolated: _) = kind.value { - if c.resetsCurrentOptions { - context.syntax.remove(.extendedSyntax) - } - if c.adding.contains(where: \.isAnyExtended) { - context.syntax.insert(.extendedSyntax) - } - if c.removing.contains(where: \.isAnyExtended) { - context.syntax.remove(.extendedSyntax) - } - } - } + applySyntaxOptions(of: kind.value) defer { context.syntax = currentSyntax } let child = try parseNode() - // An implicit scoped group has already consumed its closing paren. - if !kind.value.hasImplicitScope { - try source.expect(")") - } + try source.expect(")") return .init(kind, child, loc(start)) } @@ -409,6 +420,11 @@ extension Parser { } if let atom = try source.lexAtom(context: context) { + // If we have a change matching options atom, apply the syntax options. We + // already take care of scoping syntax options within a group. + if case .changeMatchingOptions(let opts) = atom.kind { + applySyntaxOptions(of: opts) + } // TODO: track source locations return .atom(atom) } @@ -438,16 +454,18 @@ extension Parser { defer { context.isInCustomCharacterClass = alreadyInCCC } typealias Member = CustomCC.Member - try source.expectNonEmpty() - var members: Array = [] + try parseCCCMembers(into: &members) - // We can eat an initial ']', as PCRE, Oniguruma, and ICU forbid empty - // character classes, and assume an initial ']' is literal. - if let loc = source.tryEatWithLoc("]") { - members.append(.atom(.init(.char("]"), loc))) + // If we didn't parse any semantic members, we can eat a ']' character, as + // PCRE, Oniguruma, and ICU forbid empty character classes, and assume an + // initial ']' is literal. + if members.none(\.isSemantic) { + if let loc = source.tryEatWithLoc("]") { + members.append(.atom(.init(.char("]"), loc))) + try parseCCCMembers(into: &members) + } } - try parseCCCMembers(into: &members) // If we have a binary set operator, parse it and the next members. Note // that this means we left associate for a chain of operators. @@ -458,8 +476,9 @@ extension Parser { var rhs: Array = [] try parseCCCMembers(into: &rhs) - if members.isEmpty || rhs.isEmpty { - throw ParseError.expectedCustomCharacterClassMembers + if members.none(\.isSemantic) || rhs.none(\.isSemantic) { + throw Source.LocatedError( + ParseError.expectedCustomCharacterClassMembers, start.location) } // If we're done, bail early @@ -472,8 +491,9 @@ extension Parser { // Otherwise it's just another member to accumulate members = [setOp] } - if members.isEmpty { - throw ParseError.expectedCustomCharacterClassMembers + if members.none(\.isSemantic) { + throw Source.LocatedError( + ParseError.expectedCustomCharacterClassMembers, start.location) } try source.expect("]") return CustomCC(start, members, loc(start.location.start)) @@ -484,7 +504,8 @@ extension Parser { ) throws { // Parse members until we see the end of the custom char class or an // operator. - while source.peek() != "]" && source.peekCCBinOp() == nil { + while !source.isEmpty && source.peek() != "]" && + source.peekCCBinOp() == nil { // Nested custom character class. 
if let cccStart = try source.lexCustomCCStart() { diff --git a/Sources/_RegexParser/Regex/Parse/SourceLocation.swift b/Sources/_RegexParser/Regex/Parse/SourceLocation.swift index 0a1fd2136..96562d15c 100644 --- a/Sources/_RegexParser/Regex/Parse/SourceLocation.swift +++ b/Sources/_RegexParser/Regex/Parse/SourceLocation.swift @@ -56,9 +56,11 @@ extension Source { var currentPosition: Position { bounds.lowerBound } } +protocol _LocatedErrorProtocol: Error {} + extension Source { /// An error with source location info - public struct LocatedError: Error { + public struct LocatedError: Error, _LocatedErrorProtocol { public let error: E public let location: SourceLocation @@ -70,7 +72,6 @@ extension Source { self.error = v self.location = Location(r) } - } /// Located value: a value wrapped with a source range diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 0e40ad2ce..8565b14e9 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -156,6 +156,9 @@ extension AST.Atom { case .backtrackingDirective(let d): return "\(d)" + case .changeMatchingOptions(let opts): + return "changeMatchingOptions<\(opts)>" + case .char, .scalar: fatalError("Unreachable") } @@ -225,22 +228,21 @@ extension AST.Reference: _ASTPrintable { extension AST.Group.Kind: _ASTPrintable { public var _dumpBase: String { switch self { - case .capture: return "capture" - case .namedCapture(let s): return "capture<\(s.value)>" - case .balancedCapture(let b): return "balanced capture \(b)" - case .nonCapture: return "nonCapture" - case .nonCaptureReset: return "nonCaptureReset" - case .atomicNonCapturing: return "atomicNonCapturing" - case .lookahead: return "lookahead" - case .negativeLookahead: return "negativeLookahead" - case .nonAtomicLookahead: return "nonAtomicLookahead" - case .lookbehind: return "lookbehind" - case .negativeLookbehind: return "negativeLookbehind" - case .nonAtomicLookbehind: return "nonAtomicLookbehind" - case .scriptRun: return "scriptRun" - case .atomicScriptRun: return "atomicScriptRun" - case .changeMatchingOptions(let seq, let isIsolated): - return "changeMatchingOptions<\(seq), \(isIsolated)>" + case .capture: return "capture" + case .namedCapture(let s): return "capture<\(s.value)>" + case .balancedCapture(let b): return "balanced capture \(b)" + case .nonCapture: return "nonCapture" + case .nonCaptureReset: return "nonCaptureReset" + case .atomicNonCapturing: return "atomicNonCapturing" + case .lookahead: return "lookahead" + case .negativeLookahead: return "negativeLookahead" + case .nonAtomicLookahead: return "nonAtomicLookahead" + case .lookbehind: return "lookbehind" + case .negativeLookbehind: return "negativeLookbehind" + case .nonAtomicLookbehind: return "nonAtomicLookbehind" + case .scriptRun: return "scriptRun" + case .atomicScriptRun: return "atomicScriptRun" + case .changeMatchingOptions(let seq): return "changeMatchingOptions<\(seq)>" } } } @@ -312,7 +314,9 @@ extension AST.CustomCharacterClass.Member: _ASTPrintable { case .quote(let q): return "\(q)" case .trivia(let t): return "\(t)" case .setOperation(let lhs, let op, let rhs): - return "op \(lhs) \(op.value) \(rhs)" + // TODO: We should eventually have some way of filtering out trivia for + // tests, so that it can appear in regular dumps. 
+ return "op \(lhs.filter(\.isSemantic)) \(op.value) \(rhs.filter(\.isSemantic))" } } } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 3cc2d4039..b6f9b4732 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -34,6 +34,9 @@ extension Compiler.ByteCodeGen { case let .symbolicReference(id): builder.buildUnresolvedReference(id: id) + case let .changeMatchingOptions(optionSequence): + options.apply(optionSequence) + case let .unconverted(astAtom): if let consumer = try astAtom.generateConsumer(options) { builder.buildConsume(by: consumer) @@ -349,7 +352,7 @@ extension Compiler.ByteCodeGen { case .capture, .namedCapture, .balancedCapture: throw Unreachable("These should produce a capture node") - case .changeMatchingOptions(let optionSequence, _): + case .changeMatchingOptions(let optionSequence): options.apply(optionSequence) try emitNode(child) @@ -585,6 +588,9 @@ extension Compiler.ByteCodeGen { } case let .capture(_, refId, child): + options.beginScope() + defer { options.endScope() } + let cap = builder.makeCapture(id: refId) switch child { case let .matcher(_, m): diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 600234e0d..f77cd322f 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -100,6 +100,10 @@ extension DSLTree.Atom { // TODO: Should we handle? return nil + case .changeMatchingOptions: + // TODO: Should we handle? + return nil + case let .unconverted(a): return try a.generateConsumer(opts) } @@ -178,7 +182,8 @@ extension AST.Atom { return nil case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl, - .backreference, .subpattern, .callout, .backtrackingDirective: + .backreference, .subpattern, .callout, .backtrackingDirective, + .changeMatchingOptions: // FIXME: implement return nil } @@ -287,105 +292,7 @@ extension DSLTree.CustomCharacterClass.Member { } case .trivia: // TODO: Should probably strip this earlier... - return { _, bounds in - return bounds.lowerBound - } - } - - } -} - -extension AST.CustomCharacterClass.Member { - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction { - switch self { - case .custom(let ccc): - return try ccc.generateConsumer(opts) - - case .range(let r): - guard let lhs = r.lhs.literalCharacterValue else { - throw Unsupported("\(r.lhs) in range") - } - guard let rhs = r.rhs.literalCharacterValue else { - throw Unsupported("\(r.rhs) in range") - } - - return { input, bounds in - // TODO: check for out of bounds? - let curIdx = bounds.lowerBound - if (lhs...rhs).contains(input[curIdx]) { - // TODO: semantic level - return input.index(after: curIdx) - } - return nil - } - - case .atom(let atom): - guard let gen = try atom.generateConsumer(opts) else { - throw Unsupported("TODO") - } - return gen - - case .quote(let q): - // TODO: Not optimal. - let consumers = try q.literal.map { - try AST.Atom(.char($0), .fake).generateConsumer(opts)! - } - return { input, bounds in - for consumer in consumers { - if let idx = consumer(input, bounds) { - return idx - } - } - return nil - } - - case .trivia: - throw Unreachable( - "Should have been stripped by caller") - - case .setOperation(let lhs, let op, let rhs): - // TODO: We should probably have a component type - // instead of a members array... for now we reconstruct - // an AST node... 
- let start = AST.Located( - faking: AST.CustomCharacterClass.Start.normal) - - let lhs = try AST.CustomCharacterClass( - start, lhs, .fake - ).generateConsumer(opts) - let rhs = try AST.CustomCharacterClass( - start, rhs, .fake - ).generateConsumer(opts) - - return { input, bounds in - // NOTE: Easy way to implement, not performant - let lhsIdxOpt = lhs(input, bounds) - let rhsIdxOpt = rhs(input, bounds) - - // TODO: What if lengths don't line up? - assert(lhsIdxOpt == rhsIdxOpt || lhsIdxOpt == nil - || rhsIdxOpt == nil) - - switch op.value { - case .subtraction: - guard rhsIdxOpt == nil else { return nil } - return lhsIdxOpt - - case .intersection: - if let idx = lhsIdxOpt { - return rhsIdxOpt == nil ? nil : idx - } - return nil - - case .symmetricDifference: - if let idx = lhsIdxOpt { - return rhsIdxOpt == nil ? idx : nil - } - return rhsIdxOpt - } - } + return { _, _ in nil } } } } @@ -413,29 +320,6 @@ extension DSLTree.CustomCharacterClass { } } -extension AST.CustomCharacterClass { - func generateConsumer( - _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction { - // NOTE: Easy way to implement, obviously not performant - let consumers = try strippingTriviaShallow.members.map { - try $0.generateConsumer(opts) - } - return { input, bounds in - for consumer in consumers { - if let idx = consumer(input, bounds) { - return isInverted ? nil : idx - } - } - if isInverted { - // FIXME: semantic level - return input.index(after: bounds.lowerBound) - } - return nil - } - } -} - // NOTE: Conveniences, though not most performant private func consumeScalarScript( _ s: Unicode.Script @@ -539,10 +423,6 @@ extension AST.Atom.CharacterProperty { case .onigurumaSpecial(let s): throw Unsupported("TODO: map Oniguruma special: \(s)") - - case let .other(key, value): - throw Unsupported( - "TODO: map other \(key ?? "")=\(value)") } }() diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 3afc19836..4d135898b 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -137,6 +137,8 @@ extension PrettyPrinter { print("/* TOOD: backreferences */") case .symbolicReference: print("/* TOOD: symbolic references */") + case .changeMatchingOptions: + print("/* TODO: change matching options */") } case .trivia: @@ -320,6 +322,9 @@ extension AST.Atom { case .backtrackingDirective: return " /* TODO: backtracking directive */" + + case .changeMatchingOptions: + return "/* TODO: change matching options */" } } } diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index e00dcd806..8acbd3b1b 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -206,10 +206,14 @@ extension AST.Atom { } switch self.kind { - case let .char(c): return .char(c) - case let .scalar(s): return .scalar(s) - case .any: return .any - case let .backreference(r): return .backreference(r) + case let .char(c): return .char(c) + case let .scalar(s): return .scalar(s) + case .any: return .any + case let .backreference(r): return .backreference(r) + case let .changeMatchingOptions(seq): return .changeMatchingOptions(seq) + + case .escaped(let c) where c.scalarValue != nil: + return .scalar(c.scalarValue!) 
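// Editor's note (not part of the patch): converting escapes such as \e and \f
// to scalar atoms here is what allows the scalar-range match tests later in
// this patch, e.g. [\f-\e] and [\a-\b] in MatchTests, to drop their xfail.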
default: return .unconverted(self) } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 7b2ecd515..51f5ea36f 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -166,6 +166,8 @@ extension DSLTree { case backreference(AST.Reference) case symbolicReference(ReferenceID) + case changeMatchingOptions(AST.MatchingOptionSequence) + case unconverted(AST.Atom) } } diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift index 6f38956e7..623589b54 100644 --- a/Sources/_StringProcessing/Regex/Options.swift +++ b/Sources/_StringProcessing/Regex/Options.swift @@ -195,7 +195,6 @@ extension RegexComponent { ? AST.MatchingOptionSequence(adding: [.init(option, location: .fake)]) : AST.MatchingOptionSequence(removing: [.init(option, location: .fake)]) return Regex(node: .nonCapturingGroup( - .changeMatchingOptions(sequence, isIsolated: false), - regex.root)) + .changeMatchingOptions(sequence), regex.root)) } } diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index be9f61517..51d4f8bfc 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -119,9 +119,14 @@ func atomicScriptRun(_ child: AST.Node) -> AST.Node { group(.atomicScriptRun, child) } func changeMatchingOptions( - _ seq: AST.MatchingOptionSequence, isIsolated: Bool, _ child: AST.Node + _ seq: AST.MatchingOptionSequence, _ child: AST.Node ) -> AST.Node { - group(.changeMatchingOptions(seq, isIsolated: isIsolated), child) + group(.changeMatchingOptions(seq), child) +} +func changeMatchingOptions( + _ seq: AST.MatchingOptionSequence +) -> AST.Node { + atom(.changeMatchingOptions(seq)) } func matchingOptions( @@ -354,6 +359,11 @@ func prop( ) -> AST.Node { atom(.property(.init(kind, isInverted: inverted, isPOSIX: false))) } +func posixProp( + _ kind: AST.Atom.CharacterProperty.Kind, inverted: Bool = false +) -> AST.Node { + atom(.property(.init(kind, isInverted: inverted, isPOSIX: true))) +} // Raw atom constructing variant func atom_a( diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index b1287fc8b..e00c77f56 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -281,6 +281,15 @@ extension RegexTests { // code point sequence firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true) + // Escape sequences that represent scalar values. + firstMatchTest(#"\a[\b]\e\f\n\r\t"#, + input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t", + match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t") + firstMatchTest(#"[\a][\b][\e][\f][\n][\r][\t]"#, + input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t", + match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t") + + firstMatchTest(#"\r\n"#, input: "\r\n", match: "\r\n") // MARK: Quotes @@ -596,24 +605,20 @@ extension RegexTests { func scalar(_ u: UnicodeScalar) -> UInt32 { u.value } - // Currently not supported in the matching engine. for s in scalar("\u{C}") ... scalar("\u{1B}") { let u = UnicodeScalar(s)! - firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)", - xfail: true) + firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)") } for u: UnicodeScalar in ["\u{7}", "\u{8}"] { - firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)", - xfail: true) + firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)") } for s in scalar("\u{A}") ... 
scalar("\u{D}") { let u = UnicodeScalar(s)! - firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)", - xfail: true) + firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)") } - firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}", - xfail: true) + firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}") + // Currently not supported in the matching engine. for c: UnicodeScalar in ["a", "b", "c"] { firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)", xfail: true) @@ -1203,6 +1208,19 @@ extension RegexTests { ("CaFe", true), ("EfAc", true)) } + + func testNonSemanticWhitespace() { + firstMatchTest(#" \t "#, input: " \t ", match: " \t ") + firstMatchTest(#"(?xx) \t "#, input: " \t ", match: "\t") + + firstMatchTest(#"[ \t]+"#, input: " \t ", match: " \t ") + firstMatchTest(#"(?xx)[ \t]+"#, input: " \t ", match: "\t") + firstMatchTest(#"(?xx)[ \t]+"#, input: " \t\t ", match: "\t\t") + firstMatchTest(#"(?xx)[ \t]+"#, input: " \t \t", match: "\t") + + firstMatchTest("(?xx)[ a && ab ]+", input: " aaba ", match: "aa") + firstMatchTest("(?xx)[ ] a ]+", input: " a]]a ] ", match: "a]]a") + } func testASCIIClasses() { // 'D' ASCII-only digits @@ -1301,6 +1319,17 @@ extension RegexTests { firstMatchTest(#"(((?s)a)).b"#, input: "a\nb", match: nil) firstMatchTest(#"(?s)(((?-s)a)).b"#, input: "a\nb", match: "a\nb") firstMatchTest(#"(?s)((?-s)((?i)a)).b"#, input: "a\nb", match: "a\nb") + + // Matching option changing persists across alternations. + firstMatchTest(#"a(?s)b|c|.d"#, input: "abc", match: "ab") + firstMatchTest(#"a(?s)b|c|.d"#, input: "c", match: "c") + firstMatchTest(#"a(?s)b|c|.d"#, input: "a\nd", match: "\nd") + firstMatchTest(#"a(?s)(?^)b|c|.d"#, input: "a\nd", match: nil) + firstMatchTest(#"a(?s)b|.c(?-s)|.d"#, input: "a\nd", match: nil) + firstMatchTest(#"a(?s)b|.c(?-s)|.d"#, input: "a\nc", match: "\nc") + firstMatchTest(#"a(?s)b|c(?-s)|(?^s).d"#, input: "a\nd", match: "\nd") + firstMatchTest(#"a(?:(?s).b)|.c|.d"#, input: "a\nb", match: "a\nb") + firstMatchTest(#"a(?:(?s).b)|.c"#, input: "a\nc", match: nil) } func testOptionMethods() throws { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index c6ff3e46d..94c134853 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -460,9 +460,22 @@ extension RegexTests { parseTest("[-]", charClass("-")) - // Empty character classes are forbidden, therefore this is a character - // class of literal ']'. + // Empty character classes are forbidden, therefore these are character + // classes containing literal ']'. parseTest("[]]", charClass("]")) + parseTest("[]a]", charClass("]", "a")) + parseTest("(?x)[ ]]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("]") + )) + parseTest("(?x)[ ] ]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("]") + )) + parseTest("(?x)[ ] a ]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("]", "a") + )) // These are metacharacters in certain contexts, but normal characters // otherwise. @@ -474,6 +487,41 @@ extension RegexTests { parseTest( "[a^]", charClass("a", "^")) + // These are custom character classes, not invalid POSIX character classes. + // TODO: This behavior is subtle, we ought to warn. 
+ parseTest("[[:space]]", charClass(charClass(":", "s", "p", "a", "c", "e"))) + parseTest("[:a]", charClass(":", "a")) + parseTest("[a:]", charClass("a", ":")) + parseTest("[:]", charClass(":")) + parseTest("[[:]]", charClass(charClass(":"))) + parseTest("[[:a=b=c:]]", charClass(charClass(":", "a", "=", "b", "=", "c", ":"))) + + parseTest(#"[[:a[b]:]]"#, charClass(charClass(":", "a", charClass("b"), ":"))) + parseTest(#"[[:a]][:]"#, concat(charClass(charClass(":", "a")), charClass(":"))) + parseTest(#"[[:a]]"#, charClass(charClass(":", "a"))) + parseTest(#"[[:}]]"#, charClass(charClass(":", "}"))) + parseTest(#"[[:{]]"#, charClass(charClass(":", "{"))) + parseTest(#"[[:}:]]"#, charClass(charClass(":", "}", ":"))) + + parseTest( + #"[:[:space:]:]"#, + charClass(":", posixProp_m(.binary(.whitespace)), ":") + ) + parseTest( + #"[:a[:space:]b:]"#, + charClass(":", "a", posixProp_m(.binary(.whitespace)), "b", ":") + ) + + // ICU parses a custom character class if it sees any of its known escape + // sequences in a POSIX character property (though it appears to exclude + // character class escapes e.g '\d'). We do so for any escape sequence as + // '\' is not a valid character property character. + parseTest(#"[:\Q:]\E]"#, charClass(":", quote_m(":]"))) + parseTest(#"[:\a:]"#, charClass(":", atom_m(.escaped(.alarm)), ":")) + parseTest(#"[:\d:]"#, charClass(":", atom_m(.escaped(.decimalDigit)), ":")) + parseTest(#"[:\\:]"#, charClass(":", "\\", ":")) + parseTest(#"[:\:]"#, charClass(":", ":")) + parseTest( #"\D\S\W"#, concat( @@ -502,6 +550,12 @@ extension RegexTests { posixProp_m(.binary(.uppercase), inverted: true), "c", "d")) + // Like ICU, we allow POSIX character properties outside of custom character + // classes. This also appears to be suggested by UTS#18. + // TODO: We should likely emit a warning. + parseTest("[:space:]", posixProp(.binary(.whitespace))) + parseTest("[:script=Greek:]", posixProp(.script(.greek))) + parseTest("[[[:space:]]]", charClass(charClass( posixProp_m(.binary(.whitespace)) ))) @@ -590,6 +644,15 @@ extension RegexTests { parseTest( "~~*", concat("~", zeroOrMore(of: "~"))) + parseTest( + "[ && ]", + charClass(.setOperation([" "], .init(faking: .intersection), [" ", " "])) + ) + parseTest("(?x)[ a && b ]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass(.setOperation(["a"], .init(faking: .intersection), ["b"])) + )) + // MARK: Quotes parseTest( @@ -783,81 +846,67 @@ extension RegexTests { // Matching option changing groups. 
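// Editor's note: after this patch, a bare option sequence such as (?i) parses
// as a changeMatchingOptions atom, while the bodied form (?i:...) remains a
// changeMatchingOptions group; the old isIsolated flag is gone entirely.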
parseTest("(?-)", changeMatchingOptions( - matchingOptions(), isIsolated: true, empty()) - ) + matchingOptions() + )) parseTest("(?i)", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: true, empty()) - ) + matchingOptions(adding: .caseInsensitive) + )) parseTest("(?m)", changeMatchingOptions( - matchingOptions(adding: .multiline), - isIsolated: true, empty()) - ) + matchingOptions(adding: .multiline) + )) parseTest("(?x)", changeMatchingOptions( - matchingOptions(adding: .extended), - isIsolated: true, empty()) - ) + matchingOptions(adding: .extended) + )) parseTest("(?xx)", changeMatchingOptions( - matchingOptions(adding: .extraExtended), - isIsolated: true, empty()) - ) + matchingOptions(adding: .extraExtended) + )) parseTest("(?xxx)", changeMatchingOptions( - matchingOptions(adding: .extraExtended, .extended), - isIsolated: true, empty()) - ) + matchingOptions(adding: .extraExtended, .extended) + )) parseTest("(?P)", changeMatchingOptions( - matchingOptions(adding: .asciiOnlyPOSIXProps), isIsolated: true, empty()) - ) + matchingOptions(adding: .asciiOnlyPOSIXProps) + )) parseTest("(?-i)", changeMatchingOptions( - matchingOptions(removing: .caseInsensitive), - isIsolated: true, empty()) - ) + matchingOptions(removing: .caseInsensitive) + )) parseTest("(?i-s)", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive, removing: .singleLine), - isIsolated: true, empty()) - ) + matchingOptions(adding: .caseInsensitive, removing: .singleLine) + )) parseTest("(?i-is)", changeMatchingOptions( matchingOptions(adding: .caseInsensitive, - removing: .caseInsensitive, .singleLine), - isIsolated: true, empty()) - ) + removing: .caseInsensitive, .singleLine) + )) parseTest("(?:)", nonCapture(empty())) parseTest("(?-:)", changeMatchingOptions( - matchingOptions(), isIsolated: false, empty()) - ) + matchingOptions(), empty() + )) parseTest("(?i:)", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: false, empty()) - ) + matchingOptions(adding: .caseInsensitive), empty() + )) parseTest("(?-i:)", changeMatchingOptions( - matchingOptions(removing: .caseInsensitive), - isIsolated: false, empty()) - ) + matchingOptions(removing: .caseInsensitive), empty() + )) parseTest("(?P:)", changeMatchingOptions( - matchingOptions(adding: .asciiOnlyPOSIXProps), isIsolated: false, empty()) - ) + matchingOptions(adding: .asciiOnlyPOSIXProps), empty() + )) parseTest("(?^)", changeMatchingOptions( - unsetMatchingOptions(), - isIsolated: true, empty()) - ) + unsetMatchingOptions() + )) parseTest("(?^:)", changeMatchingOptions( - unsetMatchingOptions(), - isIsolated: false, empty()) - ) + unsetMatchingOptions(), empty() + )) parseTest("(?^ims:)", changeMatchingOptions( unsetMatchingOptions(adding: .caseInsensitive, .multiline, .singleLine), - isIsolated: false, empty()) - ) + empty() + )) parseTest("(?^J:)", changeMatchingOptions( - unsetMatchingOptions(adding: .allowDuplicateGroupNames), - isIsolated: false, empty()) - ) + unsetMatchingOptions(adding: .allowDuplicateGroupNames), empty() + )) parseTest("(?^y{w}:)", changeMatchingOptions( - unsetMatchingOptions(adding: .textSegmentWordMode), - isIsolated: false, empty()) - ) + unsetMatchingOptions(adding: .textSegmentWordMode), empty() + )) let allOptions: [AST.MatchingOption.Kind] = [ .caseInsensitive, .allowDuplicateGroupNames, .multiline, .noAutoCapture, @@ -868,50 +917,64 @@ extension RegexTests { .byteSemantics ] parseTest("(?iJmnsUxxxwDPSWy{g}y{w}Xub-iJmnsUxxxwDPSW)", changeMatchingOptions( - 
matchingOptions( - adding: allOptions, - removing: allOptions.dropLast(5) - ), - isIsolated: true, empty()) - ) + matchingOptions(adding: allOptions, removing: allOptions.dropLast(5)) + )) parseTest("(?iJmnsUxxxwDPSWy{g}y{w}Xub-iJmnsUxxxwDPSW:)", changeMatchingOptions( - matchingOptions( - adding: allOptions, - removing: allOptions.dropLast(5) - ), - isIsolated: false, empty()) - ) + matchingOptions(adding: allOptions, removing: allOptions.dropLast(5)), empty() + )) parseTest( - "a(b(?i)c)d", concat("a", capture(concat("b", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: true, "c"))), "d"), + "a(b(?i)c)d", concat( + "a", + capture(concat( + "b", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "c" + )), + "d" + ), captures: .atom() ) parseTest( - "(a(?i)b(c)d)", capture(concat("a", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: true, concat("b", capture("c"), "d")))), + "(a(?i)b(c)d)", capture(concat( + "a", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "b", + capture("c"), + "d" + )), captures: .tuple(.atom(), .atom()) ) parseTest( - "(a(?i)b(?#hello)c)", capture(concat("a", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: true, concat("b", "c")))), + "(a(?i)b(?#hello)c)", capture(concat( + "a", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "b", + "c" + )), captures: .atom() ) - // TODO: This is Oniguruma's behavior, but PCRE treats it as: - // ab(?i:c)|(?i:def)|(?i:gh) - // instead. We ought to have a mode to emulate that. - parseTest("ab(?i)c|def|gh", concat("a", "b", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), isIsolated: true, - alt("c", concat("d", "e", "f"), concat("g", "h"))))) + parseTest("ab(?i)c|def|gh", alt( + concat( + "a", + "b", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "c" + ), + concat("d", "e", "f"), + concat("g", "h") + )) - parseTest("(a|b(?i)c|d)", capture(alt("a", concat("b", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), isIsolated: true, - alt("c", "d"))))), - captures: .atom()) + parseTest("(a|b(?i)c|d)", capture(alt( + "a", + concat( + "b", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "c" + ), + "d" + )), captures: .atom()) // MARK: References @@ -1096,10 +1159,6 @@ extension RegexTests { #"\p{C}+"#, oneOrMore(of: prop(.generalCategory(.other)))) - parseTest(#"\p{Lx}"#, prop(.other(key: nil, value: "Lx"))) - parseTest(#"\p{gcL}"#, prop(.other(key: nil, value: "gcL"))) - parseTest(#"\p{x=y}"#, prop(.other(key: "x", value: "y"))) - // UAX44-LM3 means all of the below are equivalent. 
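// Editor's note: UAX #44's loose-matching rule LM3 ignores case, whitespace,
// hyphens, underscores, and an initial "is" prefix when comparing property
// names and values, which is why spellings such as \p{ll},
// \p{Lowercase_Letter}, and \p{lowercase letter} below all resolve to
// General_Category=Lowercase_Letter.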
     let lowercaseLetter = prop(.generalCategory(.lowercaseLetter))
     parseTest(#"\p{ll}"#, lowercaseLetter)
@@ -1426,149 +1485,149 @@ extension RegexTests {
     parseTest("[(?#abc)]", charClass("(", "?", "#", "a", "b", "c", ")"))
     parseTest("# abc", concat("#", " ", "a", "b", "c"))
 
-    parseTest("(?x) # hello", changeMatchingOptions(matchingOptions(
-      adding: .extended), isIsolated: true, empty()))
-    parseTest("(?xx) # hello", changeMatchingOptions(matchingOptions(
-      adding: .extraExtended), isIsolated: true, empty()))
-    parseTest("(?x) \\# abc", changeMatchingOptions(matchingOptions(
-      adding: .extended), isIsolated: true, concat("#", "a", "b", "c")))
-    parseTest("(?xx) \\ ", changeMatchingOptions(matchingOptions(
-      adding: .extraExtended), isIsolated: true, concat(" ")))
+    // MARK: Matching option changing
     parseTest(
-      "(?x) a (?^) b",
-      changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        concat(
-          "a",
-          changeMatchingOptions(
-            unsetMatchingOptions(), isIsolated: true, concat(" ", "b"))
-        )
-      )
+      "(?x) # hello",
+      changeMatchingOptions(matchingOptions(adding: .extended))
     )
-
-    // End of line comments aren't applicable in custom char classes.
-    // TODO: ICU supports this.
     parseTest(
-      "(?x)[ # abc]", changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        charClass("#", "a", "b", "c"))
+      "(?xx) # hello",
+      changeMatchingOptions(matchingOptions(adding: .extraExtended))
     )
+    parseTest("(?x) \\# abc", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      "#", "a", "b", "c"
+    ))
+    parseTest("(?xx) \\ ", concat(
+      changeMatchingOptions(matchingOptions(adding: .extraExtended)), " "
+    ))
     parseTest(
-      "(?x)a b c[d e f]", changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        concat("a", "b", "c", charClass("d", "e", "f")))
-    )
-    parseTest(
-      "(?xx)a b c[d e f]", changeMatchingOptions(
-        matchingOptions(adding: .extraExtended), isIsolated: true,
-        concat("a", "b", "c", charClass("d", "e", "f")))
-    )
-    parseTest(
-      "(?x)a b c(?-x)d e f", changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        concat("a", "b", "c",
-          changeMatchingOptions(matchingOptions(removing: .extended),
-            isIsolated: true, concat("d", " ", "e", " ", "f"))))
-    )
-    parseTest(
-      "(?x)a b c(?-xx)d e f", changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        concat("a", "b", "c",
-          changeMatchingOptions(matchingOptions(removing: .extraExtended),
-            isIsolated: true, concat("d", " ", "e", " ", "f"))))
-    )
-    parseTest(
-      "(?xx)a b c(?-x)d e f", changeMatchingOptions(
-        matchingOptions(adding: .extraExtended), isIsolated: true,
-        concat("a", "b", "c",
-          changeMatchingOptions(matchingOptions(removing: .extended),
-            isIsolated: true, concat("d", " ", "e", " ", "f"))))
-    )
-    parseTest(
-      "(?x)a b c(?^i)d e f", changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        concat("a", "b", "c",
-          changeMatchingOptions(unsetMatchingOptions(adding: .caseInsensitive),
-            isIsolated: true, concat("d", " ", "e", " ", "f"))))
-    )
-    parseTest(
-      "(?x)a b c(?^x)d e f", changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        concat("a", "b", "c",
-          changeMatchingOptions(unsetMatchingOptions(adding: .extended),
-            isIsolated: true, concat("d", "e", "f"))))
-    )
-    parseTest(
-      "(?:(?x)a b c)d e f", concat(nonCapture(changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        concat("a", "b", "c"))), "d", " ", "e", " ", "f")
-    )
-    parseTest(
-      "(?x:a b c)# hi", concat(changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: false,
-        concat("a", "b", "c")), "#", " ", "h", "i")
+      "(?x) a (?^) b", concat(
+        changeMatchingOptions(matchingOptions(adding: .extended)),
+        "a",
+        changeMatchingOptions(unsetMatchingOptions()),
+        " ", "b"
+      )
     )
-    parseTest(
-      "(?x-x)a b c", changeMatchingOptions(
-        matchingOptions(adding: .extended, removing: .extended), isIsolated: true,
-        concat("a", " ", "b", " ", "c"))
-    )
-    parseTest(
-      "(?xxx-x)a b c", changeMatchingOptions(
-        matchingOptions(adding: .extraExtended, .extended, removing: .extended), isIsolated: true,
-        concat("a", " ", "b", " ", "c"))
-    )
-    parseTest(
-      "(?xx-i)a b c", changeMatchingOptions(
-        matchingOptions(adding: .extraExtended, removing: .caseInsensitive), isIsolated: true,
-        concat("a", "b", "c"))
+    // End of line comments aren't applicable in custom char classes.
+    // TODO: ICU supports this.
+    parseTest("(?x)[ # abc]", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      charClass("#", "a", "b", "c")
+    ))
+
+    parseTest("(?x)a b c[d e f]", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      "a", "b", "c", charClass("d", "e", "f")
+    ))
+    parseTest("(?xx)a b c[d e f]", concat(
+      changeMatchingOptions(matchingOptions(adding: .extraExtended)),
+      "a", "b", "c", charClass("d", "e", "f")
+    ))
+    parseTest("(?x)a b c(?-x)d e f", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      "a", "b", "c",
+      changeMatchingOptions(matchingOptions(removing: .extended)),
+      "d", " ", "e", " ", "f"
+    ))
+    parseTest("(?x)a b c(?-xx)d e f", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      "a", "b", "c",
+      changeMatchingOptions(matchingOptions(removing: .extraExtended)),
+      "d", " ", "e", " ", "f"
+    ))
+    parseTest("(?xx)a b c(?-x)d e f", concat(
+      changeMatchingOptions(matchingOptions(adding: .extraExtended)),
+      "a", "b", "c",
+      changeMatchingOptions(matchingOptions(removing: .extended)),
+      "d", " ", "e", " ", "f"
+    ))
+    parseTest("(?x)a b c(?^i)d e f", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      "a", "b", "c",
+      changeMatchingOptions(unsetMatchingOptions(adding: .caseInsensitive)),
+      "d", " ", "e", " ", "f"
+    ))
+    parseTest("(?x)a b c(?^x)d e f", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      "a", "b", "c",
+      changeMatchingOptions(unsetMatchingOptions(adding: .extended)),
+      "d", "e", "f"
+    ))
+    parseTest("(?:(?x)a b c)d e f", concat(
+      nonCapture(concat(
+        changeMatchingOptions(matchingOptions(adding: .extended)),
+        "a", "b", "c"
+      )),
+      "d", " ", "e", " ", "f"
+    ))
+    parseTest("(?x:a b c)# hi", concat(changeMatchingOptions(
+      matchingOptions(adding: .extended),
+      concat("a", "b", "c")), "#", " ", "h", "i")
     )
-
-    // PCRE states that whitespace seperating quantifiers is permitted under
-    // extended syntax http://pcre.org/current/doc/html/pcre2api.html#SEC20
-    parseTest(
-      "(?x)a *",
+    parseTest("(?x-x)a b c", concat(
       changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        zeroOrMore(of: "a"))
-    )
-    parseTest(
-      "(?x)a + ?",
+        matchingOptions(adding: .extended, removing: .extended)
+      ),
+      "a", " ", "b", " ", "c"
+    ))
+    parseTest("(?xxx-x)a b c", concat(
       changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        oneOrMore(.reluctant, of: "a"))
-    )
-    parseTest(
-      "(?x)a {2,4}",
+        matchingOptions(adding: .extraExtended, .extended, removing: .extended)
+      ),
+      "a", " ", "b", " ", "c"
+    ))
+    parseTest("(?xx-i)a b c", concat(
       changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        quantRange(2 ... 4, of: "a"))
-    )
+        matchingOptions(adding: .extraExtended, removing: .caseInsensitive)
+      ),
+      "a", "b", "c"
+    ))
+
+    // PCRE states that whitespace seperating quantifiers is permitted under
+    // extended syntax http://pcre.org/current/doc/html/pcre2api.html#SEC20
+    parseTest("(?x)a *", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      zeroOrMore(of: "a")
+    ))
+    parseTest("(?x)a + ?", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      oneOrMore(.reluctant, of: "a")
+    ))
+    parseTest("(?x)a {2,4}", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      quantRange(2 ... 4, of: "a")
+    ))
 
     // PCRE states that whitespace won't be ignored within a range.
     // http://pcre.org/current/doc/html/pcre2api.html#SEC20
     // TODO: We ought to warn on this, and produce a range anyway.
-    parseTest(
-      "(?x)a{1, 3}",
-      changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true,
-        concat("a", "{", "1", ",", "3", "}"))
-    )
+    parseTest("(?x)a{1, 3}", concat(
+      changeMatchingOptions(matchingOptions(adding: .extended)),
+      "a", "{", "1", ",", "3", "}"
+    ))
 
     // Test that we cover the list of whitespace characters covered by PCRE.
     parseTest(
       "(?x)a\t\u{A}\u{B}\u{C}\u{D}\u{85}\u{200E}\u{200F}\u{2028}\u{2029} b",
-      changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true, concat("a", "b"))
-    )
+      concat(
+        changeMatchingOptions(matchingOptions(adding: .extended)),
+        "a", "b"
+      ))
     parseTest(
       "(?x)[a\t\u{A}\u{B}\u{C}\u{D}\u{85}\u{200E}\u{200F}\u{2028}\u{2029} b]",
-      changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true, charClass("a", "b"))
-    )
+      concat(
+        changeMatchingOptions(matchingOptions(adding: .extended)),
+        charClass("a", "b")
+      ))
+
+    parseTest(#"(?i:)?"#, zeroOrOne(of: changeMatchingOptions(
+      matchingOptions(adding: .caseInsensitive), empty()
+    )))
 
     // Test multi-line comment handling.
     parseTest(
@@ -1790,10 +1849,10 @@ extension RegexTests {
 
     parseWithDelimitersTest("#|[a b]|#", charClass("a", "b"))
     parseWithDelimitersTest(
-      "#|(?-x)[a b]|#", changeMatchingOptions(
-        matchingOptions(removing: .extended), isIsolated: true,
-        charClass("a", " ", "b"))
-    )
+      "#|(?-x)[a b]|#", concat(
+        changeMatchingOptions(matchingOptions(removing: .extended)),
+        charClass("a", " ", "b")
+      ))
     parseWithDelimitersTest("#|[[a ] b]|#", charClass(charClass("a"), "b"))
 
     // Non-semantic whitespace between quantifier characters for consistency
@@ -1803,8 +1862,7 @@ extension RegexTests {
     // End-of-line comments aren't enabled by default in experimental syntax.
     parseWithDelimitersTest("#|#abc|#", concat("#", "a", "b", "c"))
     parseWithDelimitersTest("#|(?x)#abc|#", changeMatchingOptions(
-      matchingOptions(adding: .extended), isIsolated: true,
-      empty())
+      matchingOptions(adding: .extended))
     )
 
     parseWithDelimitersTest("#|||#", alt(empty(), empty()))
@@ -1860,8 +1918,7 @@ extension RegexTests {
 
       (?^) # comment
       /#
-      """, changeMatchingOptions(
-        unsetMatchingOptions(), isIsolated: true, empty())
+      """, changeMatchingOptions(unsetMatchingOptions())
     )
 
     // (?x) has no effect.
@@ -1870,8 +1927,7 @@ extension RegexTests {
 
      (?x) # comment
      /#
-      """, changeMatchingOptions(
-        matchingOptions(adding: .extended), isIsolated: true, empty())
+      """, changeMatchingOptions(matchingOptions(adding: .extended))
     )
 
     // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter
@@ -2178,12 +2234,19 @@ extension RegexTests {
     diagnosticTest(")))", .unbalancedEndOfGroup)
     diagnosticTest("())()", .unbalancedEndOfGroup)
 
+    diagnosticTest("[", .expectedCustomCharacterClassMembers)
+    diagnosticTest("[^", .expectedCustomCharacterClassMembers)
+
     diagnosticTest(#"\u{5"#, .expected("}"))
     diagnosticTest(#"\x{5"#, .expected("}"))
     diagnosticTest(#"\N{A"#, .expected("}"))
     diagnosticTest(#"\N{U+A"#, .expected("}"))
-    diagnosticTest(#"\p{a"#, .expected("}"))
-    diagnosticTest(#"\p{a="#, .expected("}"))
+    diagnosticTest(#"\p{a"#, .unknownProperty(key: nil, value: "a"))
+    diagnosticTest(#"\p{a="#, .emptyProperty)
+    diagnosticTest(#"\p{a=}"#, .emptyProperty)
+    diagnosticTest(#"\p{a=b"#, .unknownProperty(key: "a", value: "b"))
+    diagnosticTest(#"\p{aaa[b]}"#, .unknownProperty(key: nil, value: "aaa"))
+    diagnosticTest(#"\p{a=b=c}"#, .unknownProperty(key: "a", value: "b"))
 
     diagnosticTest(#"(?#"#, .expected(")"))
     diagnosticTest(#"(?x"#, .expected(")"))
@@ -2214,9 +2277,32 @@ extension RegexTests {
     diagnosticTest("(?"))
     diagnosticTest("(?", .expected(")"))
 
-    // The first ']' of a custom character class is literal, so this is missing
-    // the closing bracket.
+    // MARK: Character classes
+
+    diagnosticTest("[a", .expected("]"))
+
+    // The first ']' of a custom character class is literal, so these are
+    // missing the closing bracket.
     diagnosticTest("[]", .expected("]"))
+    diagnosticTest("(?x)[ ]", .expected("]"))
+
+    diagnosticTest("[&&]", .expectedCustomCharacterClassMembers)
+    diagnosticTest("[a&&]", .expectedCustomCharacterClassMembers)
+    diagnosticTest("[&&a]", .expectedCustomCharacterClassMembers)
+    diagnosticTest("(?x)[ && ]", .expectedCustomCharacterClassMembers)
+    diagnosticTest("(?x)[ &&a]", .expectedCustomCharacterClassMembers)
+    diagnosticTest("(?x)[a&& ]", .expectedCustomCharacterClassMembers)
+
+    diagnosticTest("[:a", .expected("]"))
+    diagnosticTest("[:a:", .expected("]"))
+    diagnosticTest("[[:a", .expected("]"))
+    diagnosticTest("[[:a:", .expected("]"))
+    diagnosticTest("[[:a[:]", .expected("]"))
+
+    diagnosticTest("[::]", .emptyProperty)
+    diagnosticTest("[:=:]", .emptyProperty)
+    diagnosticTest("[[::]]", .emptyProperty)
+    diagnosticTest("[[:=:]]", .emptyProperty)
 
     // MARK: Bad escapes
 
@@ -2245,17 +2331,28 @@ extension RegexTests {
     diagnosticTest(#"\\#u{E9}"#, .invalidEscape("é"))
     diagnosticTest(#"\˂"#, .invalidEscape("˂"))
 
-    // MARK: Text Segment options
+    // MARK: Character properties
+
+    diagnosticTest(#"\p{Lx}"#, .unknownProperty(key: nil, value: "Lx"))
+    diagnosticTest(#"\p{gcL}"#, .unknownProperty(key: nil, value: "gcL"))
+    diagnosticTest(#"\p{x=y}"#, .unknownProperty(key: "x", value: "y"))
+    diagnosticTest(#"\p{aaa(b)}"#, .unknownProperty(key: nil, value: "aaa(b)"))
+    diagnosticTest("[[:a():]]", .unknownProperty(key: nil, value: "a()"))
+    diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: "aaa"))
+    diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{"))
+
+    // MARK: Matching options
 
     diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions)
     diagnosticTest("(?-y{w})", .cannotRemoveTextSegmentOptions)
 
-    // MARK: Semantic Level options
-
     diagnosticTest("(?-X)", .cannotRemoveSemanticsOptions)
     diagnosticTest("(?-u)", .cannotRemoveSemanticsOptions)
     diagnosticTest("(?-b)", .cannotRemoveSemanticsOptions)
 
+    diagnosticTest("(?a)", .unknownGroupKind("?a"))
+    diagnosticTest("(?y{)", .expected("g"))
+
     // Extended syntax may not be removed in multi-line mode.
     diagnosticWithDelimitersTest("""
     #/
@@ -2313,12 +2410,17 @@ extension RegexTests {
     diagnosticTest("{5}", .quantifierRequiresOperand("{5}"))
     diagnosticTest("{1,3}", .quantifierRequiresOperand("{1,3}"))
 
+    // MARK: Unicode scalars
+
+    diagnosticTest(#"\u{G}"#, .expectedNumber("G", kind: .hex))
+
     // MARK: Matching options
 
     diagnosticTest(#"(?^-"#, .cannotRemoveMatchingOptionsAfterCaret)
     diagnosticTest(#"(?^-)"#, .cannotRemoveMatchingOptionsAfterCaret)
     diagnosticTest(#"(?^i-"#, .cannotRemoveMatchingOptionsAfterCaret)
     diagnosticTest(#"(?^i-m)"#, .cannotRemoveMatchingOptionsAfterCaret)
+    diagnosticTest(#"(?i)?"#, .notQuantifiable)
 
     // MARK: References
 
@@ -2351,7 +2453,7 @@ extension RegexTests {
     diagnosticTest(#"(?(1)a|b|c)"#, .tooManyBranchesInConditional(3))
     diagnosticTest(#"(?(1)||)"#, .tooManyBranchesInConditional(3))
 
-    diagnosticTest(#"(?(?i))"#, .unsupportedCondition("implicitly scoped group"))
+    diagnosticTest(#"(?(?i))"#, .unknownGroupKind("?("))
 
     // MARK: Callouts