From 6f7ab9650c156b7aad8838f2bff6e71e5af50342 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 14 Apr 2022 20:04:50 +0100 Subject: [PATCH 1/2] Throw error if we encounter stray opening '(' This should be unreachable, let's make sure of that. Doing so requires generalizing the handling of LocatedError a bit. --- .../Regex/Parse/LexicalAnalysis.swift | 21 +++++++++++++------ .../Regex/Parse/SourceLocation.swift | 5 +++-- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index f70989c9f..daa629055 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -21,6 +21,16 @@ API convention: - eat() and tryEat() is still used by the parser as a character-by-character interface */ +extension Error { + func addingLocation(_ loc: Range) -> Error { + // If we're already a LocatedError, don't change the location. + if self is _LocatedErrorProtocol { + return self + } + return Source.LocatedError(self, loc) + } +} + extension Source { // MARK: - recordLoc @@ -51,12 +61,8 @@ extension Source { do { guard let result = try f(&self) else { return nil } return Located(result, start.. 
{ - throw e - } catch let e as ParseError { - throw LocatedError(e, start..: Error { + public struct LocatedError: Error, _LocatedErrorProtocol { public let error: E public let location: SourceLocation @@ -70,7 +72,6 @@ extension Source { self.error = v self.location = Location(r) } - } /// Located value: a value wrapped with a source range From aede1f79ab228c80f24103bffcbd3beb303bdac6 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 14 Apr 2022 20:04:51 +0100 Subject: [PATCH 2/2] Change matching option scoping behavior to match PCRE Previously we would always parse a "change matching option" sequence as a group, and for the isolated syntax e.g `(?x)`, we would have it implicitly wrap everything after in the same group by having it do a recursive parse. This matched the Oniguruma behavior for such isolated groups, and was easy to implement, but its behavior is quite unintuitive when it comes to alternations, as e.g `a(?x)b|c` becomes `a(?x:b|c)`, which may not be expected. Instead, let's follow PCRE's behavior by having such isolated cases affect the syntax options for the remainder of the current group, including across alternation branches. This is done by lexing such cases as atoms (as they aren't really group-like anymore), and having them change the syntax options when we encounter them. The existing scoping rules around groups take care of resetting the options when we exit the scope. 
--- Sources/_RegexParser/Regex/AST/Atom.swift | 10 +- Sources/_RegexParser/Regex/AST/Group.swift | 19 +- .../Regex/Parse/LexicalAnalysis.swift | 98 ++-- Sources/_RegexParser/Regex/Parse/Parse.swift | 64 ++- .../_RegexParser/Regex/Printing/DumpAST.swift | 34 +- Sources/_StringProcessing/ByteCodeGen.swift | 8 +- .../_StringProcessing/ConsumerInterface.swift | 7 +- .../_StringProcessing/PrintAsPattern.swift | 5 + .../Regex/ASTConversion.swift | 9 +- Sources/_StringProcessing/Regex/DSLTree.swift | 2 + Sources/_StringProcessing/Regex/Options.swift | 3 +- .../Utility/ASTBuilder.swift | 9 +- Tests/RegexTests/MatchTests.swift | 11 + Tests/RegexTests/ParseTests.swift | 471 +++++++++--------- 14 files changed, 404 insertions(+), 346 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 6482c4042..9cc2e9a96 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -72,6 +72,9 @@ extension AST { // (*ACCEPT), (*FAIL), ... case backtrackingDirective(BacktrackingDirective) + + // (?i), (?i-m), ... 
+ case changeMatchingOptions(MatchingOptionSequence) } } } @@ -91,6 +94,7 @@ extension AST.Atom { case .subpattern(let v): return v case .callout(let v): return v case .backtrackingDirective(let v): return v + case .changeMatchingOptions(let v): return v case .any: return nil case .startOfLine: return nil case .endOfLine: return nil @@ -691,7 +695,7 @@ extension AST.Atom { return nil case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern, - .callout, .backtrackingDirective: + .callout, .backtrackingDirective, .changeMatchingOptions: return nil } } @@ -731,7 +735,7 @@ extension AST.Atom { case .property, .escaped, .any, .startOfLine, .endOfLine, .backreference, .subpattern, .namedCharacter, .callout, - .backtrackingDirective: + .backtrackingDirective, .changeMatchingOptions: return nil } } @@ -740,6 +744,8 @@ extension AST.Atom { switch kind { case .backtrackingDirective(let b): return b.isQuantifiable + case .changeMatchingOptions: + return false // TODO: Are callouts quantifiable? default: return true diff --git a/Sources/_RegexParser/Regex/AST/Group.swift b/Sources/_RegexParser/Regex/AST/Group.swift index 81e0931ad..a8c4f8b0f 100644 --- a/Sources/_RegexParser/Regex/AST/Group.swift +++ b/Sources/_RegexParser/Regex/AST/Group.swift @@ -68,9 +68,7 @@ extension AST { case atomicScriptRun // (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:) - // Isolated options are written as e.g (?i), and implicitly form a group - // containing all the following elements of the current group. - case changeMatchingOptions(MatchingOptionSequence, isIsolated: Bool) + case changeMatchingOptions(MatchingOptionSequence) // NOTE: Comments appear to be groups, but are not parsed // the same. 
They parse more like quotes, so are not @@ -87,21 +85,6 @@ extension AST.Group.Kind { } } - /// Whether this is a group with an implicit scope, e.g isolated matching - /// options implicitly become parent groups for the rest of the elements in - /// the current group: - /// - /// (a(?i)bc)de -> (a(?i:bc))de - /// - public var hasImplicitScope: Bool { - switch self { - case .changeMatchingOptions(_, let isIsolated): - return isIsolated - default: - return false - } - } - /// If this is a named group, its name, `nil` otherwise. public var name: String? { switch self { diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index daa629055..e8b7e9e18 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -712,6 +712,22 @@ extension Source { return .init(caretLoc: nil, adding: adding, minusLoc: nil, removing: []) } + /// A matching option changing atom. + /// + /// '(?' MatchingOptionSeq ')' + /// + mutating func lexChangeMatchingOptionAtom( + context: ParsingContext + ) throws -> AST.MatchingOptionSequence? { + try tryEating { src in + guard src.tryEat(sequence: "(?"), + let seq = try src.lexMatchingOptionSequence(context: context) + else { return nil } + try src.expect(")") + return seq + } + } + /// Try to consume explicitly spelled-out PCRE2 group syntax. mutating func lexExplicitPCRE2GroupStart() -> AST.Group.Kind? { tryEating { src in @@ -852,7 +868,7 @@ extension Source { // otherwise a matching option specifier. Conversely, '(?P' can be the // start of a matching option sequence, or a reference if it is followed // by '=' or '<'. 
- guard !src.shouldLexGroupLikeAtom() else { return nil } + guard !src.shouldLexGroupLikeAtom(context: context) else { return nil } guard src.tryEat("(") else { return nil } if src.tryEat("?") { @@ -877,22 +893,13 @@ extension Source { // Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:). if let seq = try src.lexMatchingOptionSequence(context: context) { - if src.tryEat(":") { - return .changeMatchingOptions(seq, isIsolated: false) - } - // If this isn't start of an explicit group, we should have an - // implicit group that covers the remaining elements of the current - // group. - // TODO: This implicit scoping behavior matches Oniguruma, but PCRE - // also does it across alternations, which will require additional - // handling. - guard src.tryEat(")") else { + guard src.tryEat(":") else { if let next = src.peek() { throw ParseError.invalidMatchingOption(next) } throw ParseError.expected(")") } - return .changeMatchingOptions(seq, isIsolated: true) + return .changeMatchingOptions(seq) } guard let next = src.peek() else { @@ -1041,18 +1048,8 @@ extension Source { context: ParsingContext ) throws -> Located? { try tryEating { src in - guard src.tryEat(sequence: "(?"), - let group = try src.lexGroupStart(context: context) - else { return nil } - - // Implicitly scoped groups are not supported here. - guard !group.value.hasImplicitScope else { - throw LocatedError( - ParseError.unsupportedCondition("implicitly scoped group"), - group.location - ) - } - return group + guard src.tryEat(sequence: "(?") else { return nil } + return try src.lexGroupStart(context: context) } } @@ -1239,17 +1236,19 @@ extension Source { allowWholePatternRef: Bool = false, allowRecursionLevel: Bool = false ) throws -> AST.Reference? { let kind = try recordLoc { src -> AST.Reference.Kind? in - // Note this logic should match canLexNumberedReference. 
- if src.tryEat("+") { - return .relative(try src.expectNumber().value) - } - if src.tryEat("-") { - return .relative(try -src.expectNumber().value) - } - if let num = try src.lexNumber() { - return .absolute(num.value) + try src.tryEating { src in + // Note this logic should match canLexNumberedReference. + if src.tryEat("+"), let num = try src.lexNumber() { + return .relative(num.value) + } + if src.tryEat("-"), let num = try src.lexNumber() { + return .relative(-num.value) + } + if let num = try src.lexNumber() { + return .absolute(num.value) + } + return nil } - return nil } guard let kind = kind else { return nil } guard allowWholePatternRef || kind.value != .recurseWholePattern else { @@ -1478,8 +1477,21 @@ extension Source { return src.canLexNumberedReference() } + private func canLexMatchingOptionsAsAtom(context: ParsingContext) -> Bool { + var src = self + + // See if we can lex a matching option sequence that terminates in ')'. Such + // a sequence is an atom. If an error is thrown, there are invalid elements + // of the matching option sequence. In such a case, we can lex as a group + // and diagnose the invalid group kind. + guard (try? src.lexMatchingOptionSequence(context: context)) != nil else { + return false + } + return src.tryEat(")") + } + /// Whether a group specifier should be lexed as an atom instead of a group. - private func shouldLexGroupLikeAtom() -> Bool { + private func shouldLexGroupLikeAtom(context: ParsingContext) -> Bool { var src = self guard src.tryEat("(") else { return false } @@ -1493,6 +1505,9 @@ extension Source { // The start of an Oniguruma 'of-contents' callout. if src.tryEat("{") { return true } + // A matching option atom (?x), (?i), ... + if src.canLexMatchingOptionsAsAtom(context: context) { return true } + return false } // The start of a backreference directive or Oniguruma named callout. 
@@ -1753,13 +1768,20 @@ extension Source { /// /// GroupLikeAtom -> GroupLikeReference | Callout | BacktrackingDirective /// - mutating func expectGroupLikeAtom() throws -> AST.Atom.Kind { + mutating func expectGroupLikeAtom( + context: ParsingContext + ) throws -> AST.Atom.Kind { try recordLoc { src in // References that look like groups, e.g (?R), (?1), ... if let ref = try src.lexGroupLikeReference() { return ref.value } + // Change matching options atom (?i), (?x-i), ... + if let seq = try src.lexChangeMatchingOptionAtom(context: context) { + return .changeMatchingOptions(seq) + } + // (*ACCEPT), (*FAIL), (*MARK), ... if let b = try src.lexBacktrackingDirective() { return .backtrackingDirective(b) @@ -1828,8 +1850,8 @@ extension Source { // If we have group syntax that was skipped over in lexGroupStart, we // need to handle it as an atom, or throw an error. - if !customCC && src.shouldLexGroupLikeAtom() { - return try src.expectGroupLikeAtom() + if !customCC && src.shouldLexGroupLikeAtom(context: context) { + return try src.expectGroupLikeAtom(context: context) } // A quantifier here is invalid. diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index b24097b83..975012546 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -282,42 +282,53 @@ extension Parser { loc(start))) } + /// Apply the syntax options of a given matching option sequence to the + /// current set of options. + private mutating func applySyntaxOptions( + of opts: AST.MatchingOptionSequence + ) { + // We skip this for multi-line, as extended syntax is always enabled there. + if context.syntax.contains(.multilineExtendedSyntax) { return } + + // Check if we're introducing or removing extended syntax. + // TODO: PCRE differentiates between (?x) and (?xx) where only the latter + // handles non-semantic whitespace in a custom character class. 
Other + // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore, + // treat (?x) and (?xx) as the same option here. If we ever get a strict + // PCRE mode, we will need to change this to handle that. + if opts.resetsCurrentOptions { + context.syntax.remove(.extendedSyntax) + } + if opts.adding.contains(where: \.isAnyExtended) { + context.syntax.insert(.extendedSyntax) + } + if opts.removing.contains(where: \.isAnyExtended) { + context.syntax.remove(.extendedSyntax) + } + } + + /// Apply the syntax options of a matching option changing group to the + /// current set of options. + private mutating func applySyntaxOptions(of group: AST.Group.Kind) { + if case .changeMatchingOptions(let seq) = group { + applySyntaxOptions(of: seq) + } + } + /// Perform a recursive parse for the body of a group. mutating func parseGroupBody( start: Source.Position, _ kind: AST.Located ) throws -> AST.Group { context.recordGroup(kind.value) - // Check if we're introducing or removing extended syntax. We skip this for - // multi-line, as extended syntax is always enabled there. - // TODO: PCRE differentiates between (?x) and (?xx) where only the latter - // handles non-semantic whitespace in a custom character class. Other - // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore, - // treat (?x) and (?xx) as the same option here. If we ever get a strict - // PCRE mode, we will need to change this to handle that. 
let currentSyntax = context.syntax - if !context.syntax.contains(.multilineExtendedSyntax) { - if case .changeMatchingOptions(let c, isIsolated: _) = kind.value { - if c.resetsCurrentOptions { - context.syntax.remove(.extendedSyntax) - } - if c.adding.contains(where: \.isAnyExtended) { - context.syntax.insert(.extendedSyntax) - } - if c.removing.contains(where: \.isAnyExtended) { - context.syntax.remove(.extendedSyntax) - } - } - } + applySyntaxOptions(of: kind.value) defer { context.syntax = currentSyntax } let child = try parseNode() - // An implicit scoped group has already consumed its closing paren. - if !kind.value.hasImplicitScope { - try source.expect(")") - } + try source.expect(")") return .init(kind, child, loc(start)) } @@ -409,6 +420,11 @@ extension Parser { } if let atom = try source.lexAtom(context: context) { + // If we have a change matching options atom, apply the syntax options. We + // already take care of scoping syntax options within a group. + if case .changeMatchingOptions(let opts) = atom.kind { + applySyntaxOptions(of: opts) + } // TODO: track source locations return .atom(atom) } diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 986f3d86e..8565b14e9 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -156,6 +156,9 @@ extension AST.Atom { case .backtrackingDirective(let d): return "\(d)" + case .changeMatchingOptions(let opts): + return "changeMatchingOptions<\(opts)>" + case .char, .scalar: fatalError("Unreachable") } @@ -225,22 +228,21 @@ extension AST.Reference: _ASTPrintable { extension AST.Group.Kind: _ASTPrintable { public var _dumpBase: String { switch self { - case .capture: return "capture" - case .namedCapture(let s): return "capture<\(s.value)>" - case .balancedCapture(let b): return "balanced capture \(b)" - case .nonCapture: return "nonCapture" - case .nonCaptureReset: return 
"nonCaptureReset" - case .atomicNonCapturing: return "atomicNonCapturing" - case .lookahead: return "lookahead" - case .negativeLookahead: return "negativeLookahead" - case .nonAtomicLookahead: return "nonAtomicLookahead" - case .lookbehind: return "lookbehind" - case .negativeLookbehind: return "negativeLookbehind" - case .nonAtomicLookbehind: return "nonAtomicLookbehind" - case .scriptRun: return "scriptRun" - case .atomicScriptRun: return "atomicScriptRun" - case .changeMatchingOptions(let seq, let isIsolated): - return "changeMatchingOptions<\(seq), \(isIsolated)>" + case .capture: return "capture" + case .namedCapture(let s): return "capture<\(s.value)>" + case .balancedCapture(let b): return "balanced capture \(b)" + case .nonCapture: return "nonCapture" + case .nonCaptureReset: return "nonCaptureReset" + case .atomicNonCapturing: return "atomicNonCapturing" + case .lookahead: return "lookahead" + case .negativeLookahead: return "negativeLookahead" + case .nonAtomicLookahead: return "nonAtomicLookahead" + case .lookbehind: return "lookbehind" + case .negativeLookbehind: return "negativeLookbehind" + case .nonAtomicLookbehind: return "nonAtomicLookbehind" + case .scriptRun: return "scriptRun" + case .atomicScriptRun: return "atomicScriptRun" + case .changeMatchingOptions(let seq): return "changeMatchingOptions<\(seq)>" } } } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 70233cf4f..c44b5af94 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -34,6 +34,9 @@ extension Compiler.ByteCodeGen { case let .symbolicReference(id): builder.buildUnresolvedReference(id: id) + case let .changeMatchingOptions(optionSequence): + options.apply(optionSequence) + case let .unconverted(astAtom): if let consumer = try astAtom.generateConsumer(options) { builder.buildConsume(by: consumer) @@ -347,7 +350,7 @@ extension Compiler.ByteCodeGen { case .capture, 
.namedCapture, .balancedCapture: throw Unreachable("These should produce a capture node") - case .changeMatchingOptions(let optionSequence, _): + case .changeMatchingOptions(let optionSequence): options.apply(optionSequence) try emitNode(child) @@ -573,6 +576,9 @@ extension Compiler.ByteCodeGen { } case let .capture(_, refId, child): + options.beginScope() + defer { options.endScope() } + let cap = builder.makeCapture(id: refId) switch child { case let .matcher(_, m): diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 49e459c0b..b49804ca1 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -100,6 +100,10 @@ extension DSLTree.Atom { // TODO: Should we handle? return nil + case .changeMatchingOptions: + // TODO: Should we handle? + return nil + case let .unconverted(a): return try a.generateConsumer(opts) } @@ -178,7 +182,8 @@ extension AST.Atom { return nil case .escaped, .keyboardControl, .keyboardMeta, .keyboardMetaControl, - .backreference, .subpattern, .callout, .backtrackingDirective: + .backreference, .subpattern, .callout, .backtrackingDirective, + .changeMatchingOptions: // FIXME: implement return nil } diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 1998aa75b..8e1dd8322 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -136,6 +136,8 @@ extension PrettyPrinter { print("/* TOOD: backreferences */") case .symbolicReference: print("/* TOOD: symbolic references */") + case .changeMatchingOptions: + print("/* TODO: change matching options */") } case .trivia: @@ -319,6 +321,9 @@ extension AST.Atom { case .backtrackingDirective: return " /* TODO: backtracking directive */" + + case .changeMatchingOptions: + return "/* TODO: change matching options */" } } } diff --git 
a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index f773bd275..ac88dcd73 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -206,10 +206,11 @@ extension AST.Atom { } switch self.kind { - case let .char(c): return .char(c) - case let .scalar(s): return .scalar(s) - case .any: return .any - case let .backreference(r): return .backreference(r) + case let .char(c): return .char(c) + case let .scalar(s): return .scalar(s) + case .any: return .any + case let .backreference(r): return .backreference(r) + case let .changeMatchingOptions(seq): return .changeMatchingOptions(seq) case .escaped(let c) where c.scalarValue != nil: return .scalar(c.scalarValue!) diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index f37505cb4..189b3a22d 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -156,6 +156,8 @@ extension DSLTree { case backreference(AST.Reference) case symbolicReference(ReferenceID) + case changeMatchingOptions(AST.MatchingOptionSequence) + case unconverted(AST.Atom) } } diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift index 38fba02d6..32e73b08c 100644 --- a/Sources/_StringProcessing/Regex/Options.swift +++ b/Sources/_StringProcessing/Regex/Options.swift @@ -168,7 +168,6 @@ extension RegexComponent { ? 
AST.MatchingOptionSequence(adding: [.init(option, location: .fake)]) : AST.MatchingOptionSequence(removing: [.init(option, location: .fake)]) return Regex(node: .nonCapturingGroup( - .changeMatchingOptions(sequence, isIsolated: false), - regex.root)) + .changeMatchingOptions(sequence), regex.root)) } } diff --git a/Sources/_StringProcessing/Utility/ASTBuilder.swift b/Sources/_StringProcessing/Utility/ASTBuilder.swift index 107a3e3c9..c14454a5c 100644 --- a/Sources/_StringProcessing/Utility/ASTBuilder.swift +++ b/Sources/_StringProcessing/Utility/ASTBuilder.swift @@ -119,9 +119,14 @@ func atomicScriptRun(_ child: AST.Node) -> AST.Node { group(.atomicScriptRun, child) } func changeMatchingOptions( - _ seq: AST.MatchingOptionSequence, isIsolated: Bool, _ child: AST.Node + _ seq: AST.MatchingOptionSequence, _ child: AST.Node ) -> AST.Node { - group(.changeMatchingOptions(seq, isIsolated: isIsolated), child) + group(.changeMatchingOptions(seq), child) +} +func changeMatchingOptions( + _ seq: AST.MatchingOptionSequence +) -> AST.Node { + atom(.changeMatchingOptions(seq)) } func matchingOptions( diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 448ff3211..8e92c5936 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1319,6 +1319,17 @@ extension RegexTests { firstMatchTest(#"(((?s)a)).b"#, input: "a\nb", match: nil) firstMatchTest(#"(?s)(((?-s)a)).b"#, input: "a\nb", match: "a\nb") firstMatchTest(#"(?s)((?-s)((?i)a)).b"#, input: "a\nb", match: "a\nb") + + // Matching option changing persists across alternations. 
+ firstMatchTest(#"a(?s)b|c|.d"#, input: "abc", match: "ab") + firstMatchTest(#"a(?s)b|c|.d"#, input: "c", match: "c") + firstMatchTest(#"a(?s)b|c|.d"#, input: "a\nd", match: "\nd") + firstMatchTest(#"a(?s)(?^)b|c|.d"#, input: "a\nd", match: nil) + firstMatchTest(#"a(?s)b|.c(?-s)|.d"#, input: "a\nd", match: nil) + firstMatchTest(#"a(?s)b|.c(?-s)|.d"#, input: "a\nc", match: "\nc") + firstMatchTest(#"a(?s)b|c(?-s)|(?^s).d"#, input: "a\nd", match: "\nd") + firstMatchTest(#"a(?:(?s).b)|.c|.d"#, input: "a\nb", match: "a\nb") + firstMatchTest(#"a(?:(?s).b)|.c"#, input: "a\nc", match: nil) } func testOptionMethods() throws { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 65dd6ed09..bdae250ba 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -464,21 +464,18 @@ extension RegexTests { // classes containing literal ']'. parseTest("[]]", charClass("]")) parseTest("[]a]", charClass("]", "a")) - parseTest( - "(?x)[ ]]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - charClass("]")) - ) - parseTest( - "(?x)[ ] ]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - charClass("]")) - ) - parseTest( - "(?x)[ ] a ]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - charClass("]", "a")) - ) + parseTest("(?x)[ ]]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("]") + )) + parseTest("(?x)[ ] ]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("]") + )) + parseTest("(?x)[ ] a ]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("]", "a") + )) // These are metacharacters in certain contexts, but normal characters // otherwise. 
@@ -630,14 +627,13 @@ extension RegexTests { "~~*", concat("~", zeroOrMore(of: "~"))) parseTest( - "[ && ]", charClass( - .setOperation([" "], .init(faking: .intersection), [" ", " "])) + "[ && ]", + charClass(.setOperation([" "], .init(faking: .intersection), [" ", " "])) ) - parseTest( - "(?x)[ a && b ]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, charClass( - .setOperation(["a"], .init(faking: .intersection), ["b"])) - )) + parseTest("(?x)[ a && b ]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass(.setOperation(["a"], .init(faking: .intersection), ["b"])) + )) // MARK: Quotes @@ -832,81 +828,67 @@ extension RegexTests { // Matching option changing groups. parseTest("(?-)", changeMatchingOptions( - matchingOptions(), isIsolated: true, empty()) - ) + matchingOptions() + )) parseTest("(?i)", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: true, empty()) - ) + matchingOptions(adding: .caseInsensitive) + )) parseTest("(?m)", changeMatchingOptions( - matchingOptions(adding: .multiline), - isIsolated: true, empty()) - ) + matchingOptions(adding: .multiline) + )) parseTest("(?x)", changeMatchingOptions( - matchingOptions(adding: .extended), - isIsolated: true, empty()) - ) + matchingOptions(adding: .extended) + )) parseTest("(?xx)", changeMatchingOptions( - matchingOptions(adding: .extraExtended), - isIsolated: true, empty()) - ) + matchingOptions(adding: .extraExtended) + )) parseTest("(?xxx)", changeMatchingOptions( - matchingOptions(adding: .extraExtended, .extended), - isIsolated: true, empty()) - ) + matchingOptions(adding: .extraExtended, .extended) + )) parseTest("(?P)", changeMatchingOptions( - matchingOptions(adding: .asciiOnlyPOSIXProps), isIsolated: true, empty()) - ) + matchingOptions(adding: .asciiOnlyPOSIXProps) + )) parseTest("(?-i)", changeMatchingOptions( - matchingOptions(removing: .caseInsensitive), - isIsolated: true, empty()) - ) + 
matchingOptions(removing: .caseInsensitive) + )) parseTest("(?i-s)", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive, removing: .singleLine), - isIsolated: true, empty()) - ) + matchingOptions(adding: .caseInsensitive, removing: .singleLine) + )) parseTest("(?i-is)", changeMatchingOptions( matchingOptions(adding: .caseInsensitive, - removing: .caseInsensitive, .singleLine), - isIsolated: true, empty()) - ) + removing: .caseInsensitive, .singleLine) + )) parseTest("(?:)", nonCapture(empty())) parseTest("(?-:)", changeMatchingOptions( - matchingOptions(), isIsolated: false, empty()) - ) + matchingOptions(), empty() + )) parseTest("(?i:)", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: false, empty()) - ) + matchingOptions(adding: .caseInsensitive), empty() + )) parseTest("(?-i:)", changeMatchingOptions( - matchingOptions(removing: .caseInsensitive), - isIsolated: false, empty()) - ) + matchingOptions(removing: .caseInsensitive), empty() + )) parseTest("(?P:)", changeMatchingOptions( - matchingOptions(adding: .asciiOnlyPOSIXProps), isIsolated: false, empty()) - ) + matchingOptions(adding: .asciiOnlyPOSIXProps), empty() + )) parseTest("(?^)", changeMatchingOptions( - unsetMatchingOptions(), - isIsolated: true, empty()) - ) + unsetMatchingOptions() + )) parseTest("(?^:)", changeMatchingOptions( - unsetMatchingOptions(), - isIsolated: false, empty()) - ) + unsetMatchingOptions(), empty() + )) parseTest("(?^ims:)", changeMatchingOptions( unsetMatchingOptions(adding: .caseInsensitive, .multiline, .singleLine), - isIsolated: false, empty()) - ) + empty() + )) parseTest("(?^J:)", changeMatchingOptions( - unsetMatchingOptions(adding: .allowDuplicateGroupNames), - isIsolated: false, empty()) - ) + unsetMatchingOptions(adding: .allowDuplicateGroupNames), empty() + )) parseTest("(?^y{w}:)", changeMatchingOptions( - unsetMatchingOptions(adding: .textSegmentWordMode), - isIsolated: false, empty()) - ) + 
unsetMatchingOptions(adding: .textSegmentWordMode), empty() + )) let allOptions: [AST.MatchingOption.Kind] = [ .caseInsensitive, .allowDuplicateGroupNames, .multiline, .noAutoCapture, @@ -917,50 +899,64 @@ extension RegexTests { .byteSemantics ] parseTest("(?iJmnsUxxxwDPSWy{g}y{w}Xub-iJmnsUxxxwDPSW)", changeMatchingOptions( - matchingOptions( - adding: allOptions, - removing: allOptions.dropLast(5) - ), - isIsolated: true, empty()) - ) + matchingOptions(adding: allOptions, removing: allOptions.dropLast(5)) + )) parseTest("(?iJmnsUxxxwDPSWy{g}y{w}Xub-iJmnsUxxxwDPSW:)", changeMatchingOptions( - matchingOptions( - adding: allOptions, - removing: allOptions.dropLast(5) - ), - isIsolated: false, empty()) - ) + matchingOptions(adding: allOptions, removing: allOptions.dropLast(5)), empty() + )) parseTest( - "a(b(?i)c)d", concat("a", capture(concat("b", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: true, "c"))), "d"), + "a(b(?i)c)d", concat( + "a", + capture(concat( + "b", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "c" + )), + "d" + ), captures: .atom() ) parseTest( - "(a(?i)b(c)d)", capture(concat("a", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: true, concat("b", capture("c"), "d")))), + "(a(?i)b(c)d)", capture(concat( + "a", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "b", + capture("c"), + "d" + )), captures: .tuple(.atom(), .atom()) ) parseTest( - "(a(?i)b(?#hello)c)", capture(concat("a", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), - isIsolated: true, concat("b", "c")))), + "(a(?i)b(?#hello)c)", capture(concat( + "a", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "b", + "c" + )), captures: .atom() ) - // TODO: This is Oniguruma's behavior, but PCRE treats it as: - // ab(?i:c)|(?i:def)|(?i:gh) - // instead. We ought to have a mode to emulate that. 
- parseTest("ab(?i)c|def|gh", concat("a", "b", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), isIsolated: true, - alt("c", concat("d", "e", "f"), concat("g", "h"))))) + parseTest("ab(?i)c|def|gh", alt( + concat( + "a", + "b", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "c" + ), + concat("d", "e", "f"), + concat("g", "h") + )) - parseTest("(a|b(?i)c|d)", capture(alt("a", concat("b", changeMatchingOptions( - matchingOptions(adding: .caseInsensitive), isIsolated: true, - alt("c", "d"))))), - captures: .atom()) + parseTest("(a|b(?i)c|d)", capture(alt( + "a", + concat( + "b", + changeMatchingOptions(matchingOptions(adding: .caseInsensitive)), + "c" + ), + "d" + )), captures: .atom()) // MARK: References @@ -1479,149 +1475,149 @@ extension RegexTests { parseTest("[(?#abc)]", charClass("(", "?", "#", "a", "b", "c", ")")) parseTest("# abc", concat("#", " ", "a", "b", "c")) - parseTest("(?x) # hello", changeMatchingOptions(matchingOptions( - adding: .extended), isIsolated: true, empty())) - parseTest("(?xx) # hello", changeMatchingOptions(matchingOptions( - adding: .extraExtended), isIsolated: true, empty())) - parseTest("(?x) \\# abc", changeMatchingOptions(matchingOptions( - adding: .extended), isIsolated: true, concat("#", "a", "b", "c"))) - parseTest("(?xx) \\ ", changeMatchingOptions(matchingOptions( - adding: .extraExtended), isIsolated: true, concat(" "))) + // MARK: Matching option changing parseTest( - "(?x) a (?^) b", - changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat( - "a", - changeMatchingOptions( - unsetMatchingOptions(), isIsolated: true, concat(" ", "b")) - ) - ) + "(?x) # hello", + changeMatchingOptions(matchingOptions(adding: .extended)) ) - - // End of line comments aren't applicable in custom char classes. - // TODO: ICU supports this. 
parseTest( - "(?x)[ # abc]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - charClass("#", "a", "b", "c")) + "(?xx) # hello", + changeMatchingOptions(matchingOptions(adding: .extraExtended)) ) + parseTest("(?x) \\# abc", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "#", "a", "b", "c" + )) + parseTest("(?xx) \\ ", concat( + changeMatchingOptions(matchingOptions(adding: .extraExtended)), " " + )) parseTest( - "(?x)a b c[d e f]", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "b", "c", charClass("d", "e", "f"))) - ) - parseTest( - "(?xx)a b c[d e f]", changeMatchingOptions( - matchingOptions(adding: .extraExtended), isIsolated: true, - concat("a", "b", "c", charClass("d", "e", "f"))) - ) - parseTest( - "(?x)a b c(?-x)d e f", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "b", "c", - changeMatchingOptions(matchingOptions(removing: .extended), - isIsolated: true, concat("d", " ", "e", " ", "f")))) - ) - parseTest( - "(?x)a b c(?-xx)d e f", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "b", "c", - changeMatchingOptions(matchingOptions(removing: .extraExtended), - isIsolated: true, concat("d", " ", "e", " ", "f")))) - ) - parseTest( - "(?xx)a b c(?-x)d e f", changeMatchingOptions( - matchingOptions(adding: .extraExtended), isIsolated: true, - concat("a", "b", "c", - changeMatchingOptions(matchingOptions(removing: .extended), - isIsolated: true, concat("d", " ", "e", " ", "f")))) - ) - parseTest( - "(?x)a b c(?^i)d e f", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "b", "c", - changeMatchingOptions(unsetMatchingOptions(adding: .caseInsensitive), - isIsolated: true, concat("d", " ", "e", " ", "f")))) - ) - parseTest( - "(?x)a b c(?^x)d e f", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - 
concat("a", "b", "c", - changeMatchingOptions(unsetMatchingOptions(adding: .extended), - isIsolated: true, concat("d", "e", "f")))) - ) - parseTest( - "(?:(?x)a b c)d e f", concat(nonCapture(changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "b", "c"))), "d", " ", "e", " ", "f") - ) - parseTest( - "(?x:a b c)# hi", concat(changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: false, - concat("a", "b", "c")), "#", " ", "h", "i") + "(?x) a (?^) b", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", + changeMatchingOptions(unsetMatchingOptions()), + " ", "b" + ) ) - parseTest( - "(?x-x)a b c", changeMatchingOptions( - matchingOptions(adding: .extended, removing: .extended), isIsolated: true, - concat("a", " ", "b", " ", "c")) - ) - parseTest( - "(?xxx-x)a b c", changeMatchingOptions( - matchingOptions(adding: .extraExtended, .extended, removing: .extended), isIsolated: true, - concat("a", " ", "b", " ", "c")) - ) - parseTest( - "(?xx-i)a b c", changeMatchingOptions( - matchingOptions(adding: .extraExtended, removing: .caseInsensitive), isIsolated: true, - concat("a", "b", "c")) + // End of line comments aren't applicable in custom char classes. + // TODO: ICU supports this. 
+ parseTest("(?x)[ # abc]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("#", "a", "b", "c") + )) + + parseTest("(?x)a b c[d e f]", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c", charClass("d", "e", "f") + )) + parseTest("(?xx)a b c[d e f]", concat( + changeMatchingOptions(matchingOptions(adding: .extraExtended)), + "a", "b", "c", charClass("d", "e", "f") + )) + parseTest("(?x)a b c(?-x)d e f", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c", + changeMatchingOptions(matchingOptions(removing: .extended)), + "d", " ", "e", " ", "f" + )) + parseTest("(?x)a b c(?-xx)d e f", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c", + changeMatchingOptions(matchingOptions(removing: .extraExtended)), + "d", " ", "e", " ", "f" + )) + parseTest("(?xx)a b c(?-x)d e f", concat( + changeMatchingOptions(matchingOptions(adding: .extraExtended)), + "a", "b", "c", + changeMatchingOptions(matchingOptions(removing: .extended)), + "d", " ", "e", " ", "f" + )) + parseTest("(?x)a b c(?^i)d e f", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c", + changeMatchingOptions(unsetMatchingOptions(adding: .caseInsensitive)), + "d", " ", "e", " ", "f" + )) + parseTest("(?x)a b c(?^x)d e f", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c", + changeMatchingOptions(unsetMatchingOptions(adding: .extended)), + "d", "e", "f" + )) + parseTest("(?:(?x)a b c)d e f", concat( + nonCapture(concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b", "c" + )), + "d", " ", "e", " ", "f" + )) + parseTest("(?x:a b c)# hi", concat(changeMatchingOptions( + matchingOptions(adding: .extended), + concat("a", "b", "c")), "#", " ", "h", "i") ) - // PCRE states that whitespace seperating quantifiers is permitted under - // extended syntax http://pcre.org/current/doc/html/pcre2api.html#SEC20 
- parseTest( - "(?x)a *", + parseTest("(?x-x)a b c", concat( changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - zeroOrMore(of: "a")) - ) - parseTest( - "(?x)a + ?", + matchingOptions(adding: .extended, removing: .extended) + ), + "a", " ", "b", " ", "c" + )) + parseTest("(?xxx-x)a b c", concat( changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - oneOrMore(.reluctant, of: "a")) - ) - parseTest( - "(?x)a {2,4}", + matchingOptions(adding: .extraExtended, .extended, removing: .extended) + ), + "a", " ", "b", " ", "c" + )) + parseTest("(?xx-i)a b c", concat( changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - quantRange(2 ... 4, of: "a")) - ) + matchingOptions(adding: .extraExtended, removing: .caseInsensitive) + ), + "a", "b", "c" + )) + + // PCRE states that whitespace separating quantifiers is permitted under + // extended syntax http://pcre.org/current/doc/html/pcre2api.html#SEC20 + parseTest("(?x)a *", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + zeroOrMore(of: "a") + )) + parseTest("(?x)a + ?", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + oneOrMore(.reluctant, of: "a") + )) + parseTest("(?x)a {2,4}", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + quantRange(2 ... 4, of: "a") + )) // PCRE states that whitespace won't be ignored within a range. // http://pcre.org/current/doc/html/pcre2api.html#SEC20 // TODO: We ought to warn on this, and produce a range anyway. - parseTest( - "(?x)a{1, 3}", - changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - concat("a", "{", "1", ",", "3", "}")) - ) + parseTest("(?x)a{1, 3}", concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "{", "1", ",", "3", "}" + )) // Test that we cover the list of whitespace characters covered by PCRE. 
parseTest( "(?x)a\t\u{A}\u{B}\u{C}\u{D}\u{85}\u{200E}\u{200F}\u{2028}\u{2029} b", - changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, concat("a", "b")) - ) + concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + "a", "b" + )) parseTest( "(?x)[a\t\u{A}\u{B}\u{C}\u{D}\u{85}\u{200E}\u{200F}\u{2028}\u{2029} b]", - changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, charClass("a", "b")) - ) + concat( + changeMatchingOptions(matchingOptions(adding: .extended)), + charClass("a", "b") + )) + + parseTest(#"(?i:)?"#, zeroOrOne(of: changeMatchingOptions( + matchingOptions(adding: .caseInsensitive), empty() + ))) // Test multi-line comment handling. parseTest( @@ -1843,10 +1839,10 @@ extension RegexTests { parseWithDelimitersTest("#|[a b]|#", charClass("a", "b")) parseWithDelimitersTest( - "#|(?-x)[a b]|#", changeMatchingOptions( - matchingOptions(removing: .extended), isIsolated: true, - charClass("a", " ", "b")) - ) + "#|(?-x)[a b]|#", concat( + changeMatchingOptions(matchingOptions(removing: .extended)), + charClass("a", " ", "b") + )) parseWithDelimitersTest("#|[[a ] b]|#", charClass(charClass("a"), "b")) // Non-semantic whitespace between quantifier characters for consistency @@ -1856,8 +1852,7 @@ extension RegexTests { // End-of-line comments aren't enabled by default in experimental syntax. parseWithDelimitersTest("#|#abc|#", concat("#", "a", "b", "c")) parseWithDelimitersTest("#|(?x)#abc|#", changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, - empty()) + matchingOptions(adding: .extended)) ) parseWithDelimitersTest("#|||#", alt(empty(), empty())) @@ -1913,8 +1908,7 @@ extension RegexTests { (?^) # comment /# - """, changeMatchingOptions( - unsetMatchingOptions(), isIsolated: true, empty()) + """, changeMatchingOptions(unsetMatchingOptions()) ) // (?x) has no effect. 
@@ -1923,8 +1917,7 @@ extension RegexTests { (?x) # comment /# - """, changeMatchingOptions( - matchingOptions(adding: .extended), isIsolated: true, empty()) + """, changeMatchingOptions(matchingOptions(adding: .extended)) ) // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter @@ -2328,17 +2321,18 @@ extension RegexTests { diagnosticTest(#"\\#u{E9}"#, .invalidEscape("é")) diagnosticTest(#"\˂"#, .invalidEscape("˂")) - // MARK: Text Segment options + // MARK: Matching options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) diagnosticTest("(?-y{w})", .cannotRemoveTextSegmentOptions) - // MARK: Semantic Level options - diagnosticTest("(?-X)", .cannotRemoveSemanticsOptions) diagnosticTest("(?-u)", .cannotRemoveSemanticsOptions) diagnosticTest("(?-b)", .cannotRemoveSemanticsOptions) + diagnosticTest("(?a)", .unknownGroupKind("?a")) + diagnosticTest("(?y{)", .expected("g")) + // Extended syntax may not be removed in multi-line mode. diagnosticWithDelimitersTest(""" #/ @@ -2406,6 +2400,7 @@ extension RegexTests { diagnosticTest(#"(?^-)"#, .cannotRemoveMatchingOptionsAfterCaret) diagnosticTest(#"(?^i-"#, .cannotRemoveMatchingOptionsAfterCaret) diagnosticTest(#"(?^i-m)"#, .cannotRemoveMatchingOptionsAfterCaret) + diagnosticTest(#"(?i)?"#, .notQuantifiable) // MARK: References @@ -2438,7 +2433,7 @@ extension RegexTests { diagnosticTest(#"(?(1)a|b|c)"#, .tooManyBranchesInConditional(3)) diagnosticTest(#"(?(1)||)"#, .tooManyBranchesInConditional(3)) - diagnosticTest(#"(?(?i))"#, .unsupportedCondition("implicitly scoped group")) + diagnosticTest(#"(?(?i))"#, .unknownGroupKind("?(")) // MARK: Callouts