diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 6482c4042..9cc2e9a96 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -72,6 +72,9 @@ extension AST { // (*ACCEPT), (*FAIL), ... case backtrackingDirective(BacktrackingDirective) + + // (?i), (?i-m), ... + case changeMatchingOptions(MatchingOptionSequence) } } } @@ -91,6 +94,7 @@ extension AST.Atom { case .subpattern(let v): return v case .callout(let v): return v case .backtrackingDirective(let v): return v + case .changeMatchingOptions(let v): return v case .any: return nil case .startOfLine: return nil case .endOfLine: return nil @@ -691,7 +695,7 @@ extension AST.Atom { return nil case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern, - .callout, .backtrackingDirective: + .callout, .backtrackingDirective, .changeMatchingOptions: return nil } } @@ -731,7 +735,7 @@ extension AST.Atom { case .property, .escaped, .any, .startOfLine, .endOfLine, .backreference, .subpattern, .namedCharacter, .callout, - .backtrackingDirective: + .backtrackingDirective, .changeMatchingOptions: return nil } } @@ -740,6 +744,8 @@ extension AST.Atom { switch kind { case .backtrackingDirective(let b): return b.isQuantifiable + case .changeMatchingOptions: + return false // TODO: Are callouts quantifiable? default: return true diff --git a/Sources/_RegexParser/Regex/AST/Group.swift b/Sources/_RegexParser/Regex/AST/Group.swift index 81e0931ad..a8c4f8b0f 100644 --- a/Sources/_RegexParser/Regex/AST/Group.swift +++ b/Sources/_RegexParser/Regex/AST/Group.swift @@ -68,9 +68,7 @@ extension AST { case atomicScriptRun // (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:) - // Isolated options are written as e.g (?i), and implicitly form a group - // containing all the following elements of the current group. 
- case changeMatchingOptions(MatchingOptionSequence, isIsolated: Bool) + case changeMatchingOptions(MatchingOptionSequence) // NOTE: Comments appear to be groups, but are not parsed // the same. They parse more like quotes, so are not @@ -87,21 +85,6 @@ extension AST.Group.Kind { } } - /// Whether this is a group with an implicit scope, e.g isolated matching - /// options implicitly become parent groups for the rest of the elements in - /// the current group: - /// - /// (a(?i)bc)de -> (a(?i:bc))de - /// - public var hasImplicitScope: Bool { - switch self { - case .changeMatchingOptions(_, let isIsolated): - return isIsolated - default: - return false - } - } - /// If this is a named group, its name, `nil` otherwise. public var name: String? { switch self { diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index f70989c9f..e8b7e9e18 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -21,6 +21,16 @@ API convention: - eat() and tryEat() is still used by the parser as a character-by-character interface */ +extension Error { + func addingLocation(_ loc: Range) -> Error { + // If we're already a LocatedError, don't change the location. + if self is _LocatedErrorProtocol { + return self + } + return Source.LocatedError(self, loc) + } +} + extension Source { // MARK: - recordLoc @@ -51,12 +61,8 @@ extension Source { do { guard let result = try f(&self) else { return nil } return Located(result, start.. { - throw e - } catch let e as ParseError { - throw LocatedError(e, start.. AST.MatchingOptionSequence? { + try tryEating { src in + guard src.tryEat(sequence: "(?"), + let seq = try src.lexMatchingOptionSequence(context: context) + else { return nil } + try src.expect(")") + return seq + } + } + /// Try to consume explicitly spelled-out PCRE2 group syntax. mutating func lexExplicitPCRE2GroupStart() -> AST.Group.Kind? 
{ tryEating { src in @@ -846,7 +868,7 @@ extension Source { // otherwise a matching option specifier. Conversely, '(?P' can be the // start of a matching option sequence, or a reference if it is followed // by '=' or '<'. - guard !src.shouldLexGroupLikeAtom() else { return nil } + guard !src.shouldLexGroupLikeAtom(context: context) else { return nil } guard src.tryEat("(") else { return nil } if src.tryEat("?") { @@ -871,22 +893,13 @@ extension Source { // Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:). if let seq = try src.lexMatchingOptionSequence(context: context) { - if src.tryEat(":") { - return .changeMatchingOptions(seq, isIsolated: false) - } - // If this isn't start of an explicit group, we should have an - // implicit group that covers the remaining elements of the current - // group. - // TODO: This implicit scoping behavior matches Oniguruma, but PCRE - // also does it across alternations, which will require additional - // handling. - guard src.tryEat(")") else { + guard src.tryEat(":") else { if let next = src.peek() { throw ParseError.invalidMatchingOption(next) } throw ParseError.expected(")") } - return .changeMatchingOptions(seq, isIsolated: true) + return .changeMatchingOptions(seq) } guard let next = src.peek() else { @@ -1035,18 +1048,8 @@ extension Source { context: ParsingContext ) throws -> Located? { try tryEating { src in - guard src.tryEat(sequence: "(?"), - let group = try src.lexGroupStart(context: context) - else { return nil } - - // Implicitly scoped groups are not supported here. - guard !group.value.hasImplicitScope else { - throw LocatedError( - ParseError.unsupportedCondition("implicitly scoped group"), - group.location - ) - } - return group + guard src.tryEat(sequence: "(?") else { return nil } + return try src.lexGroupStart(context: context) } } @@ -1233,17 +1236,19 @@ extension Source { allowWholePatternRef: Bool = false, allowRecursionLevel: Bool = false ) throws -> AST.Reference? 
{ let kind = try recordLoc { src -> AST.Reference.Kind? in - // Note this logic should match canLexNumberedReference. - if src.tryEat("+") { - return .relative(try src.expectNumber().value) - } - if src.tryEat("-") { - return .relative(try -src.expectNumber().value) - } - if let num = try src.lexNumber() { - return .absolute(num.value) + try src.tryEating { src in + // Note this logic should match canLexNumberedReference. + if src.tryEat("+"), let num = try src.lexNumber() { + return .relative(num.value) + } + if src.tryEat("-"), let num = try src.lexNumber() { + return .relative(-num.value) + } + if let num = try src.lexNumber() { + return .absolute(num.value) + } + return nil } - return nil } guard let kind = kind else { return nil } guard allowWholePatternRef || kind.value != .recurseWholePattern else { @@ -1472,8 +1477,21 @@ extension Source { return src.canLexNumberedReference() } + private func canLexMatchingOptionsAsAtom(context: ParsingContext) -> Bool { + var src = self + + // See if we can lex a matching option sequence that terminates in ')'. Such + // a sequence is an atom. If an error is thrown, there are invalid elements + // of the matching option sequence. In such a case, we can lex as a group + // and diagnose the invalid group kind. + guard (try? src.lexMatchingOptionSequence(context: context)) != nil else { + return false + } + return src.tryEat(")") + } + /// Whether a group specifier should be lexed as an atom instead of a group. - private func shouldLexGroupLikeAtom() -> Bool { + private func shouldLexGroupLikeAtom(context: ParsingContext) -> Bool { var src = self guard src.tryEat("(") else { return false } @@ -1487,6 +1505,9 @@ extension Source { // The start of an Oniguruma 'of-contents' callout. if src.tryEat("{") { return true } + // A matching option atom (?x), (?i), ... + if src.canLexMatchingOptionsAsAtom(context: context) { return true } + return false } // The start of a backreference directive or Oniguruma named callout. 
@@ -1747,13 +1768,20 @@ extension Source { /// /// GroupLikeAtom -> GroupLikeReference | Callout | BacktrackingDirective /// - mutating func expectGroupLikeAtom() throws -> AST.Atom.Kind { + mutating func expectGroupLikeAtom( + context: ParsingContext + ) throws -> AST.Atom.Kind { try recordLoc { src in // References that look like groups, e.g (?R), (?1), ... if let ref = try src.lexGroupLikeReference() { return ref.value } + // Change matching options atom (?i), (?x-i), ... + if let seq = try src.lexChangeMatchingOptionAtom(context: context) { + return .changeMatchingOptions(seq) + } + // (*ACCEPT), (*FAIL), (*MARK), ... if let b = try src.lexBacktrackingDirective() { return .backtrackingDirective(b) @@ -1822,8 +1850,8 @@ extension Source { // If we have group syntax that was skipped over in lexGroupStart, we // need to handle it as an atom, or throw an error. - if !customCC && src.shouldLexGroupLikeAtom() { - return try src.expectGroupLikeAtom() + if !customCC && src.shouldLexGroupLikeAtom(context: context) { + return try src.expectGroupLikeAtom(context: context) } // A quantifier here is invalid. @@ -1841,6 +1869,9 @@ extension Source { } throw Unreachable("TODO: reason") + case "(" where !customCC: + throw Unreachable("Should have lexed a group or group-like atom") + // (sometimes) special metacharacters case ".": return customCC ? .char(".") : .any case "^": return customCC ? .char("^") : .startOfLine diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index b24097b83..975012546 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -282,42 +282,53 @@ extension Parser { loc(start))) } + /// Apply the syntax options of a given matching option sequence to the + /// current set of options. + private mutating func applySyntaxOptions( + of opts: AST.MatchingOptionSequence + ) { + // We skip this for multi-line, as extended syntax is always enabled there. 
+ if context.syntax.contains(.multilineExtendedSyntax) { return } + + // Check if we're introducing or removing extended syntax. + // TODO: PCRE differentiates between (?x) and (?xx) where only the latter + // handles non-semantic whitespace in a custom character class. Other + // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore, + // treat (?x) and (?xx) as the same option here. If we ever get a strict + // PCRE mode, we will need to change this to handle that. + if opts.resetsCurrentOptions { + context.syntax.remove(.extendedSyntax) + } + if opts.adding.contains(where: \.isAnyExtended) { + context.syntax.insert(.extendedSyntax) + } + if opts.removing.contains(where: \.isAnyExtended) { + context.syntax.remove(.extendedSyntax) + } + } + + /// Apply the syntax options of a matching option changing group to the + /// current set of options. + private mutating func applySyntaxOptions(of group: AST.Group.Kind) { + if case .changeMatchingOptions(let seq) = group { + applySyntaxOptions(of: seq) + } + } + /// Perform a recursive parse for the body of a group. mutating func parseGroupBody( start: Source.Position, _ kind: AST.Located ) throws -> AST.Group { context.recordGroup(kind.value) - // Check if we're introducing or removing extended syntax. We skip this for - // multi-line, as extended syntax is always enabled there. - // TODO: PCRE differentiates between (?x) and (?xx) where only the latter - // handles non-semantic whitespace in a custom character class. Other - // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore, - // treat (?x) and (?xx) as the same option here. If we ever get a strict - // PCRE mode, we will need to change this to handle that. 
let currentSyntax = context.syntax - if !context.syntax.contains(.multilineExtendedSyntax) { - if case .changeMatchingOptions(let c, isIsolated: _) = kind.value { - if c.resetsCurrentOptions { - context.syntax.remove(.extendedSyntax) - } - if c.adding.contains(where: \.isAnyExtended) { - context.syntax.insert(.extendedSyntax) - } - if c.removing.contains(where: \.isAnyExtended) { - context.syntax.remove(.extendedSyntax) - } - } - } + applySyntaxOptions(of: kind.value) defer { context.syntax = currentSyntax } let child = try parseNode() - // An implicit scoped group has already consumed its closing paren. - if !kind.value.hasImplicitScope { - try source.expect(")") - } + try source.expect(")") return .init(kind, child, loc(start)) } @@ -409,6 +420,11 @@ extension Parser { } if let atom = try source.lexAtom(context: context) { + // If we have a change matching options atom, apply the syntax options. We + // already take care of scoping syntax options within a group. + if case .changeMatchingOptions(let opts) = atom.kind { + applySyntaxOptions(of: opts) + } // TODO: track source locations return .atom(atom) } diff --git a/Sources/_RegexParser/Regex/Parse/SourceLocation.swift b/Sources/_RegexParser/Regex/Parse/SourceLocation.swift index 0a1fd2136..96562d15c 100644 --- a/Sources/_RegexParser/Regex/Parse/SourceLocation.swift +++ b/Sources/_RegexParser/Regex/Parse/SourceLocation.swift @@ -56,9 +56,11 @@ extension Source { var currentPosition: Position { bounds.lowerBound } } +protocol _LocatedErrorProtocol: Error {} + extension Source { /// An error with source location info - public struct LocatedError: Error { + public struct LocatedError: Error, _LocatedErrorProtocol { public let error: E public let location: SourceLocation @@ -70,7 +72,6 @@ extension Source { self.error = v self.location = Location(r) } - } /// Located value: a value wrapped with a source range diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift 
b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 986f3d86e..8565b14e9 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -156,6 +156,9 @@ extension AST.Atom { case .backtrackingDirective(let d): return "\(d)" + case .changeMatchingOptions(let opts): + return "changeMatchingOptions<\(opts)>" + case .char, .scalar: fatalError("Unreachable") } @@ -225,22 +228,21 @@ extension AST.Reference: _ASTPrintable { extension AST.Group.Kind: _ASTPrintable { public var _dumpBase: String { switch self { - case .capture: return "capture" - case .namedCapture(let s): return "capture<\(s.value)>" - case .balancedCapture(let b): return "balanced capture \(b)" - case .nonCapture: return "nonCapture" - case .nonCaptureReset: return "nonCaptureReset" - case .atomicNonCapturing: return "atomicNonCapturing" - case .lookahead: return "lookahead" - case .negativeLookahead: return "negativeLookahead" - case .nonAtomicLookahead: return "nonAtomicLookahead" - case .lookbehind: return "lookbehind" - case .negativeLookbehind: return "negativeLookbehind" - case .nonAtomicLookbehind: return "nonAtomicLookbehind" - case .scriptRun: return "scriptRun" - case .atomicScriptRun: return "atomicScriptRun" - case .changeMatchingOptions(let seq, let isIsolated): - return "changeMatchingOptions<\(seq), \(isIsolated)>" + case .capture: return "capture" + case .namedCapture(let s): return "capture<\(s.value)>" + case .balancedCapture(let b): return "balanced capture \(b)" + case .nonCapture: return "nonCapture" + case .nonCaptureReset: return "nonCaptureReset" + case .atomicNonCapturing: return "atomicNonCapturing" + case .lookahead: return "lookahead" + case .negativeLookahead: return "negativeLookahead" + case .nonAtomicLookahead: return "nonAtomicLookahead" + case .lookbehind: return "lookbehind" + case .negativeLookbehind: return "negativeLookbehind" + case .nonAtomicLookbehind: return "nonAtomicLookbehind" + case .scriptRun: 
return "scriptRun" + case .atomicScriptRun: return "atomicScriptRun" + case .changeMatchingOptions(let seq): return "changeMatchingOptions<\(seq)>" } } } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 70233cf4f..c44b5af94 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -34,6 +34,9 @@ extension Compiler.ByteCodeGen { case let .symbolicReference(id): builder.buildUnresolvedReference(id: id) + case let .changeMatchingOptions(optionSequence): + options.apply(optionSequence) + case let .unconverted(astAtom): if let consumer = try astAtom.generateConsumer(options) { builder.buildConsume(by: consumer) @@ -347,7 +350,7 @@ extension Compiler.ByteCodeGen { case .capture, .namedCapture, .balancedCapture: throw Unreachable("These should produce a capture node") - case .changeMatchingOptions(let optionSequence, _): + case .changeMatchingOptions(let optionSequence): options.apply(optionSequence) try emitNode(child) @@ -573,6 +576,9 @@ extension Compiler.ByteCodeGen { } case let .capture(_, refId, child): + options.beginScope() + defer { options.endScope() } + let cap = builder.makeCapture(id: refId) switch child { case let .matcher(_, m): diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 49e459c0b..b49804ca1 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -100,6 +100,10 @@ extension DSLTree.Atom { // TODO: Should we handle? return nil + case .changeMatchingOptions: + // TODO: Should we handle? 
+ return nil + case let .unconverted(a): return try a.generateConsumer(opts) } @@ -178,7 +182,8 @@ extension AST.Atom { return nil case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl, - .backreference, .subpattern, .callout, .backtrackingDirective: + .backreference, .subpattern, .callout, .backtrackingDirective, + .changeMatchingOptions: // FIXME: implement return nil } diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 1998aa75b..8e1dd8322 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -136,6 +136,8 @@ extension PrettyPrinter { print("/* TOOD: backreferences */") case .symbolicReference: print("/* TOOD: symbolic references */") + case .changeMatchingOptions: + print("/* TODO: change matching options */") } case .trivia: @@ -319,6 +321,9 @@ extension AST.Atom { case .backtrackingDirective: return " /* TODO: backtracking directive */" + + case .changeMatchingOptions: + return "/* TODO: change matching options */" } } } diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index f773bd275..ac88dcd73 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -206,10 +206,11 @@ extension AST.Atom { } switch self.kind { - case let .char(c): return .char(c) - case let .scalar(s): return .scalar(s) - case .any: return .any - case let .backreference(r): return .backreference(r) + case let .char(c): return .char(c) + case let .scalar(s): return .scalar(s) + case .any: return .any + case let .backreference(r): return .backreference(r) + case let .changeMatchingOptions(seq): return .changeMatchingOptions(seq) case .escaped(let c) where c.scalarValue != nil: return .scalar(c.scalarValue!) 
diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index f37505cb4..189b3a22d 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -156,6 +156,8 @@ extension DSLTree { case backreference(AST.Reference) case symbolicReference(ReferenceID) + case changeMatchingOptions(AST.MatchingOptionSequence) + case unconverted(AST.Atom) } } diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift index 38fba02d6..32e73b08c 100644 --- a/Sources/_StringProcessing/Regex/Options.swift +++ b/Sources/_StringProcessing/Regex/Options.swift @@ -168,7 +168,6 @@ extension RegexComponent { ? AST.MatchingOptionSequence(adding: [.init(option, location: .fake)]) : AST.MatchingOptionSequence(removing: [.init(option, location: .fake)]) return Regex(node: .nonCapturingGroup( - .changeMatchingOptions(sequence, isIsolated: false), - regex.root)) + .changeMatchingOptions(sequence), regex.root)) } } diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index 107a3e3c9..c14454a5c 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -119,9 +119,14 @@ func atomicScriptRun(_ child: AST.Node) -> AST.Node { group(.atomicScriptRun, child) } func changeMatchingOptions( - _ seq: AST.MatchingOptionSequence, isIsolated: Bool, _ child: AST.Node + _ seq: AST.MatchingOptionSequence, _ child: AST.Node ) -> AST.Node { - group(.changeMatchingOptions(seq, isIsolated: isIsolated), child) + group(.changeMatchingOptions(seq), child) +} +func changeMatchingOptions( + _ seq: AST.MatchingOptionSequence +) -> AST.Node { + atom(.changeMatchingOptions(seq)) } func matchingOptions( diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 448ff3211..8e92c5936 100644 --- 
a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1319,6 +1319,17 @@ extension RegexTests { firstMatchTest(#"(((?s)a)).b"#, input: "a\nb", match: nil) firstMatchTest(#"(?s)(((?-s)a)).b"#, input: "a\nb", match: "a\nb") firstMatchTest(#"(?s)((?-s)((?i)a)).b"#, input: "a\nb", match: "a\nb") + + // Matching option changing persists across alternations. + firstMatchTest(#"a(?s)b|c|.d"#, input: "abc", match: "ab") + firstMatchTest(#"a(?s)b|c|.d"#, input: "c", match: "c") + firstMatchTest(#"a(?s)b|c|.d"#, input: "a\nd", match: "\nd") + firstMatchTest(#"a(?s)(?^)b|c|.d"#, input: "a\nd", match: nil) + firstMatchTest(#"a(?s)b|.c(?-s)|.d"#, input: "a\nd", match: nil) + firstMatchTest(#"a(?s)b|.c(?-s)|.d"#, input: "a\nc", match: "\nc") + firstMatchTest(#"a(?s)b|c(?-s)|(?^s).d"#, input: "a\nd", match: "\nd") + firstMatchTest(#"a(?:(?s).b)|.c|.d"#, input: "a\nb", match: "a\nb") + firstMatchTest(#"a(?:(?s).b)|.c"#, input: "a\nc", match: nil) } func testOptionMethods() throws { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 65dd6ed09..bdae250ba 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -464,21 +464,18 @@ extension RegexTests { // classes containing literal ']'. 
parseTest("[]]", charClass("]")) parseTest("[]a]", charClass("]", "a")) - parseTest( - "(?x)[ ]]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - charClass("]")) - ) - parseTest( - "(?x)[ ] ]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - charClass("]")) - ) - parseTest( - "(?x)[ ] a ]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - charClass("]", "a")) - ) + parseTest("(?x)[ ]]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("]") + )) + parseTest("(?x)[ ] ]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("]") + )) + parseTest("(?x)[ ] a ]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("]", "a") + )) // These are metacharacters in certain contexts, but normal characters // otherwise. @@ -630,14 +627,13 @@ extension RegexTests { "~~*", concat("~", zeroOrMore(of: "~"))) parseTest( - "[ && ]", charClass( - .setOperation([" "], .init(faking: .intersection), [" ", " "])) + "[ && ]", + charClass(.setOperation([" "], .init(faking: .intersection), [" ", " "])) ) - parseTest( - "(?x)[ a && b ]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, charClass( - .setOperation(["a"], .init(faking: .intersection), ["b"])) - )) + parseTest("(?x)[ a && b ]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass(.setOperation(["a"], .init(faking: .intersection), ["b"])) + )) // MARK: Quotes @@ -832,81 +828,67 @@ extension RegexTests { // Matching option changing groups. 
parseTest("(?-)", changeMatchingOptions( - matchingOptions(), isIsolated: true, empty()) - ) + matchingOptions() + )) parseTest("(?i)", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: true, empty()) - ) + matchingOptions(adding: .caseInsensitive) + )) parseTest("(?m)", changeMatchingOptions( - matchingOptions(adding: .multiline), - isIsolated: true, empty()) - ) + matchingOptions(adding: .multiline) + )) parseTest("(?x)", changeMatchingOptions( - matchingOptions(adding: .extended), - isIsolated: true, empty()) - ) + matchingOptions(adding: .extended) + )) parseTest("(?xx)", changeMatchingOptions( - matchingOptions(adding: .extraExtended), - isIsolated: true, empty()) - ) + matchingOptions(adding: .extraExtended) + )) parseTest("(?xxx)", changeMatchingOptions( - matchingOptions(adding: .extraExtended, .extended), - isIsolated: true, empty()) - ) + matchingOptions(adding: .extraExtended, .extended) + )) parseTest("(?P)", changeMatchingOptions( - matchingOptions(adding: .asciiOnlyPOSIXProps), isIsolated: true, empty()) - ) + matchingOptions(adding: .asciiOnlyPOSIXProps) + )) parseTest("(?-i)", changeMatchingOptions( - matchingOptions(removing: .caseInsensitive), - isIsolated: true, empty()) - ) + matchingOptions(removing: .caseInsensitive) + )) parseTest("(?i-s)", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive, removing: .singleLine), - isIsolated: true, empty()) - ) + matchingOptions(adding: .caseInsensitive, removing: .singleLine) + )) parseTest("(?i-is)", changeMatchingOptions( matchingOptions(adding: .caseInsensitive, - removing: .caseInsensitive, .singleLine), - isIsolated: true, empty()) - ) + removing: .caseInsensitive, .singleLine) + )) parseTest("(?:)", nonCapture(empty())) parseTest("(?-:)", changeMatchingOptions( - matchingOptions(), isIsolated: false, empty()) - ) + matchingOptions(), empty() + )) parseTest("(?i:)", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: false, 
empty()) - ) + matchingOptions(adding: .caseInsensitive), empty() + )) parseTest("(?-i:)", changeMatchingOptions( - matchingOptions(removing: .caseInsensitive), - isIsolated: false, empty()) - ) + matchingOptions(removing: .caseInsensitive), empty() + )) parseTest("(?P:)", changeMatchingOptions( - matchingOptions(adding: .asciiOnlyPOSIXProps), isIsolated: false, empty()) - ) + matchingOptions(adding: .asciiOnlyPOSIXProps), empty() + )) parseTest("(?^)", changeMatchingOptions( - unsetMatchingOptions(), - isIsolated: true, empty()) - ) + unsetMatchingOptions() + )) parseTest("(?^:)", changeMatchingOptions( - unsetMatchingOptions(), - isIsolated: false, empty()) - ) + unsetMatchingOptions(), empty() + )) parseTest("(?^ims:)", changeMatchingOptions( unsetMatchingOptions(adding: .caseInsensitive, .multiline, .singleLine), - isIsolated: false, empty()) - ) + empty() + )) parseTest("(?^J:)", changeMatchingOptions( - unsetMatchingOptions(adding: .allowDuplicateGroupNames), - isIsolated: false, empty()) - ) + unsetMatchingOptions(adding: .allowDuplicateGroupNames), empty() + )) parseTest("(?^y{w}:)", changeMatchingOptions( - unsetMatchingOptions(adding: .textSegmentWordMode), - isIsolated: false, empty()) - ) + unsetMatchingOptions(adding: .textSegmentWordMode), empty() + )) let allOptions: [AST.MatchingOption.Kind] = [ .caseInsensitive, .allowDuplicateGroupNames, .multiline, .noAutoCapture, @@ -917,50 +899,64 @@ extension RegexTests { .byteSemantics ] parseTest("(?iJmnsUxxxwDPSWy{g}y{w}Xub-iJmnsUxxxwDPSW)", changeMatchingOptions( - matchingOptions( - adding: allOptions, - removing: allOptions.dropLast(5) - ), - isIsolated: true, empty()) - ) + matchingOptions(adding: allOptions, removing: allOptions.dropLast(5)) + )) parseTest("(?iJmnsUxxxwDPSWy{g}y{w}Xub-iJmnsUxxxwDPSW:)", changeMatchingOptions( - matchingOptions( - adding: allOptions, - removing: allOptions.dropLast(5) - ), - isIsolated: false, empty()) - ) + matchingOptions(adding: allOptions, removing: 
allOptions.dropLast(5)), empty() + )) parseTest( - "a(b(?i)c)d", concat("a", capture(concat("b", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: true, "c"))), "d"), + "a(b(?i)c)d", concat( + "a", + capture(concat( + "b", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "c" + )), + "d" + ), captures: .atom() ) parseTest( - "(a(?i)b(c)d)", capture(concat("a", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: true, concat("b", capture("c"), "d")))), + "(a(?i)b(c)d)", capture(concat( + "a", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "b", + capture("c"), + "d" + )), captures: .tuple(.atom(), .atom()) ) parseTest( - "(a(?i)b(?#hello)c)", capture(concat("a", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: true, concat("b", "c")))), + "(a(?i)b(?#hello)c)", capture(concat( + "a", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "b", + "c" + )), captures: .atom() ) - // TODO: This is Oniguruma's behavior, but PCRE treats it as: - // ab(?i:c)|(?i:def)|(?i:gh) - // instead. We ought to have a mode to emulate that. 
- parseTest("ab(?i)c|def|gh", concat("a", "b", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), isIsolated: true, - alt("c", concat("d", "e", "f"), concat("g", "h"))))) + parseTest("ab(?i)c|def|gh", alt( + concat( + "a", + "b", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "c" + ), + concat("d", "e", "f"), + concat("g", "h") + )) - parseTest("(a|b(?i)c|d)", capture(alt("a", concat("b", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), isIsolated: true, - alt("c", "d"))))), - captures: .atom()) + parseTest("(a|b(?i)c|d)", capture(alt( + "a", + concat( + "b", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "c" + ), + "d" + )), captures: .atom()) // MARK: References @@ -1479,149 +1475,149 @@ extension RegexTests { parseTest("[(?#abc)]", charClass("(", "?", "#", "a", "b", "c", ")")) parseTest("# abc", concat("#", " ", "a", "b", "c")) - parseTest("(?x) # hello", changeMatchingOptions(matchingOptions( - adding: .extended), isIsolated: true, empty())) - parseTest("(?xx) # hello", changeMatchingOptions(matchingOptions( - adding: .extraExtended), isIsolated: true, empty())) - parseTest("(?x) \\# abc", changeMatchingOptions(matchingOptions( - adding: .extended), isIsolated: true, concat("#", "a", "b", "c"))) - parseTest("(?xx) \\ ", changeMatchingOptions(matchingOptions( - adding: .extraExtended), isIsolated: true, concat(" "))) + // MARK: Matching option changing parseTest( - "(?x) a (?^) b", - changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat( - "a", - changeMatchingOptions( - unsetMatchingOptions(), isIsolated: true, concat(" ", "b")) - ) - ) + "(?x) # hello", + changeMatchingOptions(matchingOptions(adding: .extended)) ) - - // End of line comments aren't applicable in custom char classes. - // TODO: ICU supports this. 
parseTest( - "(?x)[ # abc]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - charClass("#", "a", "b", "c")) + "(?xx) # hello", + changeMatchingOptions(matchingOptions(adding: .extraExtended)) ) + parseTest("(?x) \\# abc", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "#", "a", "b", "c" + )) + parseTest("(?xx) \\ ", concat( + changeMatchingOptions(matchingOptions(adding: .extraExtended)), " " + )) parseTest( - "(?x)a b c[d e f]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "b", "c", charClass("d", "e", "f"))) - ) - parseTest( - "(?xx)a b c[d e f]", changeMatchingOptions( - matchingOptions(adding: .extraExtended), isIsolated: true, - concat("a", "b", "c", charClass("d", "e", "f"))) - ) - parseTest( - "(?x)a b c(?-x)d e f", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "b", "c", - changeMatchingOptions(matchingOptions(removing: .extended), - isIsolated: true, concat("d", " ", "e", " ", "f")))) - ) - parseTest( - "(?x)a b c(?-xx)d e f", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "b", "c", - changeMatchingOptions(matchingOptions(removing: .extraExtended), - isIsolated: true, concat("d", " ", "e", " ", "f")))) - ) - parseTest( - "(?xx)a b c(?-x)d e f", changeMatchingOptions( - matchingOptions(adding: .extraExtended), isIsolated: true, - concat("a", "b", "c", - changeMatchingOptions(matchingOptions(removing: .extended), - isIsolated: true, concat("d", " ", "e", " ", "f")))) - ) - parseTest( - "(?x)a b c(?^i)d e f", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "b", "c", - changeMatchingOptions(unsetMatchingOptions(adding: .caseInsensitive), - isIsolated: true, concat("d", " ", "e", " ", "f")))) - ) - parseTest( - "(?x)a b c(?^x)d e f", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - 
concat("a", "b", "c", - changeMatchingOptions(unsetMatchingOptions(adding: .extended), - isIsolated: true, concat("d", "e", "f")))) - ) - parseTest( - "(?:(?x)a b c)d e f", concat(nonCapture(changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "b", "c"))), "d", " ", "e", " ", "f") - ) - parseTest( - "(?x:a b c)# hi", concat(changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: false, - concat("a", "b", "c")), "#", " ", "h", "i") + "(?x) a (?^) b", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", + changeMatchingOptions(unsetMatchingOptions()), + " ", "b" + ) ) - parseTest( - "(?x-x)a b c", changeMatchingOptions( - matchingOptions(adding: .extended, removing: .extended), isIsolated: true, - concat("a", " ", "b", " ", "c")) - ) - parseTest( - "(?xxx-x)a b c", changeMatchingOptions( - matchingOptions(adding: .extraExtended, .extended, removing: .extended), isIsolated: true, - concat("a", " ", "b", " ", "c")) - ) - parseTest( - "(?xx-i)a b c", changeMatchingOptions( - matchingOptions(adding: .extraExtended, removing: .caseInsensitive), isIsolated: true, - concat("a", "b", "c")) + // End of line comments aren't applicable in custom char classes. + // TODO: ICU supports this. 
+ parseTest("(?x)[ # abc]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("#", "a", "b", "c") + )) + + parseTest("(?x)a b c[d e f]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c", charClass("d", "e", "f") + )) + parseTest("(?xx)a b c[d e f]", concat( + changeMatchingOptions(matchingOptions(adding: .extraExtended)), + "a", "b", "c", charClass("d", "e", "f") + )) + parseTest("(?x)a b c(?-x)d e f", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c", + changeMatchingOptions(matchingOptions(removing: .extended)), + "d", " ", "e", " ", "f" + )) + parseTest("(?x)a b c(?-xx)d e f", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c", + changeMatchingOptions(matchingOptions(removing: .extraExtended)), + "d", " ", "e", " ", "f" + )) + parseTest("(?xx)a b c(?-x)d e f", concat( + changeMatchingOptions(matchingOptions(adding: .extraExtended)), + "a", "b", "c", + changeMatchingOptions(matchingOptions(removing: .extended)), + "d", " ", "e", " ", "f" + )) + parseTest("(?x)a b c(?^i)d e f", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c", + changeMatchingOptions(unsetMatchingOptions(adding: .caseInsensitive)), + "d", " ", "e", " ", "f" + )) + parseTest("(?x)a b c(?^x)d e f", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c", + changeMatchingOptions(unsetMatchingOptions(adding: .extended)), + "d", "e", "f" + )) + parseTest("(?:(?x)a b c)d e f", concat( + nonCapture(concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c" + )), + "d", " ", "e", " ", "f" + )) + parseTest("(?x:a b c)# hi", concat(changeMatchingOptions( + matchingOptions(adding: .extended), + concat("a", "b", "c")), "#", " ", "h", "i") ) - // PCRE states that whitespace seperating quantifiers is permitted under - // extended syntax http://pcre.org/current/doc/html/pcre2api.html#SEC20 
- parseTest( - "(?x)a *", + parseTest("(?x-x)a b c", concat( changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - zeroOrMore(of: "a")) - ) - parseTest( - "(?x)a + ?", + matchingOptions(adding: .extended, removing: .extended) + ), + "a", " ", "b", " ", "c" + )) + parseTest("(?xxx-x)a b c", concat( changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - oneOrMore(.reluctant, of: "a")) - ) - parseTest( - "(?x)a {2,4}", + matchingOptions(adding: .extraExtended, .extended, removing: .extended) + ), + "a", " ", "b", " ", "c" + )) + parseTest("(?xx-i)a b c", concat( changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - quantRange(2 ... 4, of: "a")) - ) + matchingOptions(adding: .extraExtended, removing: .caseInsensitive) + ), + "a", "b", "c" + )) + + // PCRE states that whitespace separating quantifiers is permitted under + // extended syntax http://pcre.org/current/doc/html/pcre2api.html#SEC20 + parseTest("(?x)a *", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + zeroOrMore(of: "a") + )) + parseTest("(?x)a + ?", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + oneOrMore(.reluctant, of: "a") + )) + parseTest("(?x)a {2,4}", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + quantRange(2 ... 4, of: "a") + )) // PCRE states that whitespace won't be ignored within a range. // http://pcre.org/current/doc/html/pcre2api.html#SEC20 // TODO: We ought to warn on this, and produce a range anyway. - parseTest( - "(?x)a{1, 3}", - changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "{", "1", ",", "3", "}")) - ) + parseTest("(?x)a{1, 3}", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "{", "1", ",", "3", "}" + )) // Test that we cover the list of whitespace characters covered by PCRE. 
parseTest( "(?x)a\t\u{A}\u{B}\u{C}\u{D}\u{85}\u{200E}\u{200F}\u{2028}\u{2029} b", - changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, concat("a", "b")) - ) + concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b" + )) parseTest( "(?x)[a\t\u{A}\u{B}\u{C}\u{D}\u{85}\u{200E}\u{200F}\u{2028}\u{2029} b]", - changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, charClass("a", "b")) - ) + concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("a", "b") + )) + + parseTest(#"(?i:)?"#, zeroOrOne(of: changeMatchingOptions( + matchingOptions(adding: .caseInsensitive), empty() + ))) // Test multi-line comment handling. parseTest( @@ -1843,10 +1839,10 @@ extension RegexTests { parseWithDelimitersTest("#|[a b]|#", charClass("a", "b")) parseWithDelimitersTest( - "#|(?-x)[a b]|#", changeMatchingOptions( - matchingOptions(removing: .extended), isIsolated: true, - charClass("a", " ", "b")) - ) + "#|(?-x)[a b]|#", concat( + changeMatchingOptions(matchingOptions(removing: .extended)), + charClass("a", " ", "b") + )) parseWithDelimitersTest("#|[[a ] b]|#", charClass(charClass("a"), "b")) // Non-semantic whitespace between quantifier characters for consistency @@ -1856,8 +1852,7 @@ extension RegexTests { // End-of-line comments aren't enabled by default in experimental syntax. parseWithDelimitersTest("#|#abc|#", concat("#", "a", "b", "c")) parseWithDelimitersTest("#|(?x)#abc|#", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - empty()) + matchingOptions(adding: .extended)) ) parseWithDelimitersTest("#|||#", alt(empty(), empty())) @@ -1913,8 +1908,7 @@ extension RegexTests { (?^) # comment /# - """, changeMatchingOptions( - unsetMatchingOptions(), isIsolated: true, empty()) + """, changeMatchingOptions(unsetMatchingOptions()) ) // (?x) has no effect. 
@@ -1923,8 +1917,7 @@ extension RegexTests { (?x) # comment /# - """, changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, empty()) + """, changeMatchingOptions(matchingOptions(adding: .extended)) ) // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter @@ -2328,17 +2321,18 @@ extension RegexTests { diagnosticTest(#"\\#u{E9}"#, .invalidEscape("é")) diagnosticTest(#"\˂"#, .invalidEscape("˂")) - // MARK: Text Segment options + // MARK: Matching options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) diagnosticTest("(?-y{w})", .cannotRemoveTextSegmentOptions) - // MARK: Semantic Level options - diagnosticTest("(?-X)", .cannotRemoveSemanticsOptions) diagnosticTest("(?-u)", .cannotRemoveSemanticsOptions) diagnosticTest("(?-b)", .cannotRemoveSemanticsOptions) + diagnosticTest("(?a)", .unknownGroupKind("?a")) + diagnosticTest("(?y{)", .expected("g")) + // Extended syntax may not be removed in multi-line mode. diagnosticWithDelimitersTest(""" #/ @@ -2406,6 +2400,7 @@ extension RegexTests { diagnosticTest(#"(?^-)"#, .cannotRemoveMatchingOptionsAfterCaret) diagnosticTest(#"(?^i-"#, .cannotRemoveMatchingOptionsAfterCaret) diagnosticTest(#"(?^i-m)"#, .cannotRemoveMatchingOptionsAfterCaret) + diagnosticTest(#"(?i)?"#, .notQuantifiable) // MARK: References @@ -2438,7 +2433,7 @@ extension RegexTests { diagnosticTest(#"(?(1)a|b|c)"#, .tooManyBranchesInConditional(3)) diagnosticTest(#"(?(1)||)"#, .tooManyBranchesInConditional(3)) - diagnosticTest(#"(?(?i))"#, .unsupportedCondition("implicitly scoped group")) + diagnosticTest(#"(?(?i))"#, .unknownGroupKind("?(")) // MARK: Callouts