From d3bd6ad9544e3bcfd7f84ad8c2afe16517bf604d Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 21 Mar 2022 19:30:32 +0000 Subject: [PATCH 01/17] Error on unknown escape sequences Throw an error for unknown a-z escape sequences as well as non-ASCII non-whitespace escape sequences. --- .../Regex/Parse/Diagnostics.swift | 3 ++ .../Regex/Parse/LexicalAnalysis.swift | 11 ++++- Tests/RegexTests/ParseTests.swift | 42 ++++++++++++++----- 3 files changed, 45 insertions(+), 11 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index b9c99d9d3..d4c809045 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -39,6 +39,7 @@ enum ParseError: Error, Hashable { case expectedNonEmptyContents case expectedEscape + case invalidEscape(Character) case cannotReferToWholePattern @@ -107,6 +108,8 @@ extension ParseError: CustomStringConvertible { return "expected non-empty contents" case .expectedEscape: return "expected escape sequence" + case .invalidEscape(let c): + return "invalid escape sequence '\\\(c)'" case .cannotReferToWholePattern: return "cannot refer to whole pattern here" case .notQuantifiable: diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index cfab75312..4eb0ebea4 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -1489,8 +1489,17 @@ extension Source { return try .scalar( src.expectUnicodeScalar(escapedCharacter: char).value) default: - return .char(char) + break } + + // We only allow unknown escape sequences for non-letter ASCII, and + // non-ASCII whitespace. 
+ guard (char.isASCII && !char.isLetter) || + (!char.isASCII && char.isWhitespace) + else { + throw ParseError.invalidEscape(char) + } + return .char(char) } } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 6e511767a..69a3a785b 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -544,9 +544,8 @@ extension RegexTests { #"a\Q \Q \\.\Eb"#, concat("a", quote(#" \Q \\."#), "b")) - // These follow the PCRE behavior. + // This follows the PCRE behavior. parseTest(#"\Q\\E"#, quote("\\")) - parseTest(#"\E"#, "E") parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"), syntax: .experimental) @@ -566,6 +565,16 @@ extension RegexTests { parseTest(#"["-"]"#, charClass(range_m("\"", "\""))) + // MARK: Escapes + + // Not metachars, but we allow their escape as ASCII. + parseTest(#"\<"#, "<") + parseTest(#"\ "#, " ") + parseTest(#"\\"#, "\\") + + // Escaped U+3000 IDEOGRAPHIC SPACE. + parseTest(#"\\#u{3000}"#, "\u{3000}") + // MARK: Comments parseTest( @@ -989,13 +998,6 @@ extension RegexTests { // Backreferences are not valid in custom character classes. parseTest(#"[\8]"#, charClass("8")) parseTest(#"[\9]"#, charClass("9")) - parseTest(#"[\g]"#, charClass("g")) - parseTest(#"[\g+30]"#, charClass("g", "+", "3", "0")) - parseTest(#"[\g{1}]"#, charClass("g", "{", "1", "}")) - parseTest(#"[\k'a']"#, charClass("k", "'", "a", "'")) - - parseTest(#"\g"#, atom(.char("g"))) - parseTest(#"\k"#, atom(.char("k"))) // MARK: Character names. @@ -1526,7 +1528,7 @@ extension RegexTests { parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x")) parseWithDelimitersTest(#"re'šŸ”„šŸ‡©šŸ‡°'"#, concat("šŸ”„", "šŸ‡©šŸ‡°")) - parseWithDelimitersTest(#"re'\šŸ”„āœ…'"#, concat("šŸ”„", "āœ…")) + parseWithDelimitersTest(#"re'šŸ”„āœ…'"#, concat("šŸ”„", "āœ…")) // Printable ASCII characters. 
delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##) @@ -1875,6 +1877,26 @@ extension RegexTests { diagnosticTest("\\", .expectedEscape) + // TODO: Custom diagnostic for expected backref + diagnosticTest(#"\g"#, .invalidEscape("g")) + diagnosticTest(#"\k"#, .invalidEscape("k")) + + // TODO: Custom diagnostic for backref in custom char class + diagnosticTest(#"[\g]"#, .invalidEscape("g")) + diagnosticTest(#"[\g+30]"#, .invalidEscape("g")) + diagnosticTest(#"[\g{1}]"#, .invalidEscape("g")) + diagnosticTest(#"[\k'a']"#, .invalidEscape("k")) + + // TODO: Custom diagnostic for missing '\Q' + diagnosticTest(#"\E"#, .invalidEscape("E")) + + // Non-ASCII non-whitespace cases. + diagnosticTest(#"\šŸ”„"#, .invalidEscape("šŸ”„")) + diagnosticTest(#"\šŸ‡©šŸ‡°"#, .invalidEscape("šŸ‡©šŸ‡°")) + diagnosticTest(#"\e\#u{301}"#, .invalidEscape("e\u{301}")) + diagnosticTest(#"\\#u{E9}"#, .invalidEscape("Ć©")) + diagnosticTest(#"\Ė‚"#, .invalidEscape("Ė‚")) + // MARK: Text Segment options diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions) From 5a52d531097d3e67daf2c7af3a863d66aaf6388f Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 21 Mar 2022 19:30:32 +0000 Subject: [PATCH 02/17] Allow certain escape sequences in character class ranges Certain escape sequences express a unicode scalar and as such are valid in a range. 
--- Sources/_RegexParser/Regex/AST/Atom.swift | 60 ++++++++++++++++++-- Sources/_RegexParser/Regex/Parse/Parse.swift | 5 +- Tests/RegexTests/MatchTests.swift | 29 ++++++++++ Tests/RegexTests/ParseTests.swift | 31 ++++++++++ 4 files changed, 118 insertions(+), 7 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index bc346469b..0aa0951c5 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -641,17 +641,67 @@ extension AST.Atom { case .scalar(let s): return Character(s) + case .escaped(let c): + switch c { + // TODO: Should we separate these into a separate enum? Or move the + // specifics of the scalar to the DSL tree? + case .alarm: + return "\u{7}" + case .backspace: + return "\u{8}" + case .escape: + return "\u{1B}" + case .formfeed: + return "\u{C}" + case .newline: + return "\n" + case .carriageReturn: + return "\r" + case .tab: + return "\t" + + case .singleDataUnit, .decimalDigit, .notDecimalDigit, + .horizontalWhitespace, .notHorizontalWhitespace, .notNewline, + .newlineSequence, .whitespace, .notWhitespace, .verticalTab, + .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster, + .wordBoundary, .notWordBoundary, .startOfSubject, + .endOfSubjectBeforeNewline, .endOfSubject, + .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar, + .textSegment, .notTextSegment: + return nil + } + case .keyboardControl, .keyboardMeta, .keyboardMetaControl: - // TODO: Not a character per-say, what should we do? - fallthrough + // TODO: These should have unicode scalar values. + return nil - case .property, .escaped, .any, .startOfLine, .endOfLine, - .backreference, .subpattern, .namedCharacter, .callout, - .backtrackingDirective: + case .namedCharacter: + // TODO: This should have a unicode scalar value depending on the name + // given. + // TODO: Do we want to validate and assign a scalar value when building + // the AST? 
Or defer for the matching engine? + return nil + + case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern, + .callout, .backtrackingDirective: return nil } } + /// Whether this atom is valid as the operand of a custom character class + /// range. + public var isValidCharacterClassRangeBound: Bool { + // If we have a literal character value for this, it can be used as a bound. + if literalCharacterValue != nil { return true } + switch kind { + // \cx, \C-x, \M-x, \M-\C-x, \N{...} + case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter: + return true + default: + return false + } + } + /// Produce a string literal representation of the atom, if possible /// /// Individual characters will be returned, Unicode scalars will be diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 296956fdc..4481cf602 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -489,10 +489,11 @@ extension Parser { // Range between atoms. if let (dashLoc, rhs) = try source.lexCustomCharClassRangeEnd(context: context) { - guard atom.literalCharacterValue != nil && - rhs.literalCharacterValue != nil else { + guard atom.isValidCharacterClassRangeBound && + rhs.isValidCharacterClassRangeBound else { throw ParseError.invalidCharacterClassRangeOperand } + // TODO: Validate lower <= upper? members.append(.range(.init(atom, dashLoc, rhs))) continue } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 52db17aa7..67412d262 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -594,6 +594,35 @@ extension RegexTests { firstMatchTest("[[:script=Greek:]]", input: "123αβγxyz", match: "α") + func scalar(_ u: UnicodeScalar) -> UInt32 { u.value } + + // Currently not supported in the matching engine. + for s in scalar("\u{C}") ... scalar("\u{1B}") { + let u = UnicodeScalar(s)! 
+ firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)", + xfail: true) + } + for u: UnicodeScalar in ["\u{7}", "\u{8}"] { + firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)", + xfail: true) + } + for s in scalar("\u{A}") ... scalar("\u{D}") { + let u = UnicodeScalar(s)! + firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)", + xfail: true) + } + firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}", + xfail: true) + + for c: UnicodeScalar in ["a", "b", "c"] { + firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)", + xfail: true) + } + for c: UnicodeScalar in ["$", "%", "&", "'"] { + firstMatchTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, + input: "#()\(c)", match: "\(c)", xfail: true) + } + // MARK: Operators firstMatchTest( diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 69a3a785b..76327ac64 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -494,6 +494,25 @@ extension RegexTests { parseTest("[*]", charClass("*")) parseTest("[{0}]", charClass("{", "0", "}")) + parseTest(#"[\f-\e]"#, charClass( + range_m(.escaped(.formfeed), .escaped(.escape)))) + parseTest(#"[\a-\b]"#, charClass( + range_m(.escaped(.alarm), .escaped(.backspace)))) + parseTest(#"[\n-\r]"#, charClass( + range_m(.escaped(.newline), .escaped(.carriageReturn)))) + parseTest(#"[\t-\t]"#, charClass( + range_m(.escaped(.tab), .escaped(.tab)))) + + parseTest(#"[\cX-\cY\C-A-\C-B\M-\C-A-\M-\C-B\M-A-\M-B]"#, charClass( + range_m(.keyboardControl("X"), .keyboardControl("Y")), + range_m(.keyboardControl("A"), .keyboardControl("B")), + range_m(.keyboardMetaControl("A"), .keyboardMetaControl("B")), + range_m(.keyboardMeta("A"), .keyboardMeta("B")) + )) + + parseTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass( + range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE")))) + // MARK: Operators parseTest( @@ -575,6 +594,15 @@ extension RegexTests { // 
Escaped U+3000 IDEOGRAPHIC SPACE. parseTest(#"\\#u{3000}"#, "\u{3000}") + // Control and meta controls. + parseTest(#"\c "#, atom(.keyboardControl(" "))) + parseTest(#"\c!"#, atom(.keyboardControl("!"))) + parseTest(#"\c~"#, atom(.keyboardControl("~"))) + parseTest(#"\C--"#, atom(.keyboardControl("-"))) + parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a"))) + parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-"))) + parseTest(#"\M-a"#, atom(.keyboardMeta("a"))) + // MARK: Comments parseTest( @@ -1877,6 +1905,9 @@ extension RegexTests { diagnosticTest("\\", .expectedEscape) + // TODO: Custom diagnostic for control sequence + diagnosticTest(#"\c"#, .unexpectedEndOfInput) + // TODO: Custom diagnostic for expected backref diagnosticTest(#"\g"#, .invalidEscape("g")) diagnosticTest(#"\k"#, .invalidEscape("k")) From 692f0fd15bbced7f347ed8b99021d9ad45148369 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 21 Mar 2022 19:30:33 +0000 Subject: [PATCH 03/17] Remove obsolete CharacterClass model computation This is now done from the DSLTree. --- .../_StringProcessing/CharacterClass.swift | 75 ------------------- 1 file changed, 75 deletions(-) diff --git a/Sources/_StringProcessing/CharacterClass.swift b/Sources/_StringProcessing/CharacterClass.swift index 0b95e08b4..d44fa9fb2 100644 --- a/Sources/_StringProcessing/CharacterClass.swift +++ b/Sources/_StringProcessing/CharacterClass.swift @@ -319,21 +319,6 @@ extension CharacterClass { } } -extension AST.Node { - /// If this has a character class representation, whether built-in or custom, return it. - /// - /// TODO: Not sure if this the right model type, but I suspect we'll want to produce - /// something like this on demand - var characterClass: CharacterClass? { - switch self { - case let .customCharacterClass(cc): return cc.modelCharacterClass - case let .atom(a): return a.characterClass - - default: return nil - } - } -} - extension DSLTree.Node { var characterClass: CharacterClass? 
{ switch self { @@ -502,66 +487,6 @@ extension DSLTree.CustomCharacterClass { } } -extension AST.CustomCharacterClass { - /// The model character class for this custom character class. - var modelCharacterClass: CharacterClass? { - typealias Component = CharacterClass.CharacterSetComponent - func getComponents(_ members: [Member]) -> [Component]? { - var result = Array() - for m in members { - switch m { - case .custom(let cc): - guard let cc = cc.modelCharacterClass else { - return nil - } - result.append(.characterClass(cc)) - case .range(let r): - result.append(.range( - r.lhs.literalCharacterValue! ... - r.rhs.literalCharacterValue!)) - - case .atom(let a): - if let cc = a.characterClass { - result.append(.characterClass(cc)) - } else if let lit = a.literalCharacterValue { - result.append(.character(lit)) - } else { - return nil - } - - case .quote(let q): - // Decompose quoted literal into literal characters. - result += q.literal.map { .character($0) } - - case .trivia: - // Not semantically important. - break - - case .setOperation(let lhs, let op, let rhs): - // FIXME: CharacterClass wasn't designed for set operations with - // multiple components in each operand, we should fix that. For now, - // just produce custom components. - guard let lhs = getComponents(lhs), - let rhs = getComponents(rhs) - else { - return nil - } - result.append(.setOperation(.init( - lhs: .characterClass(.custom(lhs)), - op: op.value, - rhs: .characterClass(.custom(rhs))))) - } - } - return result - } - guard let comps = getComponents(members) else { - return nil - } - let cc = CharacterClass.custom(comps) - return self.isInverted ? 
cc.inverted : cc - } -} - extension CharacterClass { // FIXME: Calling on inverted sets wont be the same as the // inverse of a boundary if at the start or end of the From cdf98c5f94bf159450015cc72e675a0930b9dd36 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 21 Mar 2022 19:30:33 +0000 Subject: [PATCH 04/17] Forbid empty character classes As per PCRE, Oniguruma, and ICU, a first character of `]` is treated as literal. --- Sources/_RegexParser/Regex/Parse/Parse.swift | 6 ++++++ Tests/RegexTests/ParseTests.swift | 8 ++++++++ 2 files changed, 14 insertions(+) diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 4481cf602..7867073e6 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -425,6 +425,12 @@ extension Parser { try source.expectNonEmpty() var members: Array = [] + + // We can eat an initial ']', as PCRE, Oniguruma, and ICU forbid empty + // character classes, and assume an initial ']' is literal. + if let loc = source.tryEatWithLoc("]") { + members.append(.atom(.init(.char("]"), loc))) + } try parseCCCMembers(into: &members) // If we have a binary set operator, parse it and the next members. Note diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 76327ac64..f6f31c075 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -428,6 +428,10 @@ extension RegexTests { parseTest("[-]", charClass("-")) + // Empty character classes are forbidden, therefore this is a character + // class of literal ']'. + parseTest("[]]", charClass("]")) + // These are metacharacters in certain contexts, but normal characters // otherwise. parseTest( @@ -1901,6 +1905,10 @@ extension RegexTests { diagnosticTest("(?")) diagnosticTest("(?", .expected(")")) + // The first ']' of a custom character class is literal, so this is missing + // the closing bracket. 
+ diagnosticTest("[]", .expected("]")) + // MARK: Bad escapes diagnosticTest("\\", .expectedEscape) From c5ec8be4088b438f893a623fa0266b10ffaf0450 Mon Sep 17 00:00:00 2001 From: Evan Wilde Date: Wed, 30 Mar 2022 22:16:35 -0700 Subject: [PATCH 05/17] Remove extra const from gestScriptExtensions Returning a constant pointer is extraneous and leads to a bunch of warnings. Since you don't control where the pointer is assigned you can't really control whether the pointer is const or not. The uint8_t inside can be const though. --- Sources/_CUnicode/UnicodeScalarProps.c | 4 ++-- Sources/_CUnicode/include/UnicodeData.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Sources/_CUnicode/UnicodeScalarProps.c b/Sources/_CUnicode/UnicodeScalarProps.c index 9e5a8890a..81ead1421 100644 --- a/Sources/_CUnicode/UnicodeScalarProps.c +++ b/Sources/_CUnicode/UnicodeScalarProps.c @@ -68,8 +68,8 @@ uint8_t _swift_stdlib_getScript(uint32_t scalar) { } SWIFT_CC -const uint8_t * const _swift_stdlib_getScriptExtensions(uint32_t scalar, - uint8_t *count) { +const uint8_t *_swift_stdlib_getScriptExtensions(uint32_t scalar, + uint8_t *count) { intptr_t dataIdx = _swift_stdlib_getScalarBitArrayIdx(scalar, _swift_stdlib_script_extensions, _swift_stdlib_script_extensions_ranks); diff --git a/Sources/_CUnicode/include/UnicodeData.h b/Sources/_CUnicode/include/UnicodeData.h index c9437868c..3ce6e3591 100644 --- a/Sources/_CUnicode/include/UnicodeData.h +++ b/Sources/_CUnicode/include/UnicodeData.h @@ -66,6 +66,7 @@ SWIFT_CC uint8_t _swift_stdlib_getScript(uint32_t scalar); SWIFT_CC -const uint8_t * const _swift_stdlib_getScriptExtensions(uint32_t scalar, uint8_t *count); +const uint8_t *_swift_stdlib_getScriptExtensions(uint32_t scalar, + uint8_t *count); #endif // SWIFT_STDLIB_SHIMS_UNICODEDATA_H From 0108e22cfc4ec70c2db4ac555f5e92b446b97e87 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 31 Mar 2022 10:57:38 -0600 Subject: [PATCH 06/17] DSL support for atomic groups 
(#238) --- Sources/RegexBuilder/DSL.swift | 12 ++ Sources/RegexBuilder/Variadics.swift | 167 ++++++++++++++++++ .../VariadicsGenerator.swift | 61 +++++++ 3 files changed, 240 insertions(+) diff --git a/Sources/RegexBuilder/DSL.swift b/Sources/RegexBuilder/DSL.swift index 457439a43..632f1baba 100644 --- a/Sources/RegexBuilder/DSL.swift +++ b/Sources/RegexBuilder/DSL.swift @@ -235,6 +235,18 @@ public struct TryCapture: _BuiltinRegexComponent { // Note: Public initializers are currently gyb'd. See Variadics.swift. } +// MARK: - Groups + +/// An atomic group, i.e. opens a local backtracking scope which, upon successful exit, +/// discards any remaining backtracking points from within the scope +public struct BacktrackingScope: _BuiltinRegexComponent { + public var regex: Regex + + internal init(_ regex: Regex) { + self.regex = regex + } +} + // MARK: - Backreference public struct Reference: RegexComponent { diff --git a/Sources/RegexBuilder/Variadics.swift b/Sources/RegexBuilder/Variadics.swift index f59b1f13a..002898dfd 100644 --- a/Sources/RegexBuilder/Variadics.swift +++ b/Sources/RegexBuilder/Variadics.swift @@ -1566,6 +1566,173 @@ extension Repeat { self.init(node: .repeating(expression.relative(to: 0..( + _ component: Component + ) where Output == Substring { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + @_disfavoredOverload + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == Substring { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0), Component.Output == (W, C0) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0), Component.Output 
== (W, C0) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0, C1), Component.Output == (W, C0, C1) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0, C1), Component.Output == (W, C0, C1) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0, C1, C2), Component.Output == (W, C0, C1, C2) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0, C1, C2), Component.Output == (W, C0, C1, C2) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0, C1, C2, C3), Component.Output == (W, C0, C1, C2, C3) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0, C1, C2, C3), Component.Output == (W, C0, C1, C2, C3) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0, C1, C2, C3, C4), Component.Output == (W, C0, C1, C2, C3, C4) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: 
() -> Component + ) where Output == (Substring, C0, C1, C2, C3, C4), Component.Output == (W, C0, C1, C2, C3, C4) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5), Component.Output == (W, C0, C1, C2, C3, C4, C5) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5), Component.Output == (W, C0, C1, C2, C3, C4, C5) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ 
component: Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} +extension BacktrackingScope { + public init( + _ component: Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component.regex.root)) + } +} + +extension BacktrackingScope { + public init( + @RegexComponentBuilder _ component: () -> Component + ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { + self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) + } +} extension AlternationBuilder { public static func buildPartialBlock( accumulated: R0, next: R1 diff --git a/Sources/VariadicsGenerator/VariadicsGenerator.swift b/Sources/VariadicsGenerator/VariadicsGenerator.swift index ff406e9fb..23a362dad 100644 --- a/Sources/VariadicsGenerator/VariadicsGenerator.swift +++ b/Sources/VariadicsGenerator/VariadicsGenerator.swift @@ -155,6 +155,13 @@ struct VariadicsGenerator: ParsableCommand { print(to: &standardError) } + print("Generating atomic groups...", to: &standardError) + for arity in 0...maxArity { + print(" Arity \(arity): ", terminator: "", to: &standardError) + emitAtomicGroup(arity: arity) + print(to: &standardError) + } + print("Generating alternation overloads...", to: &standardError) for (leftArity, rightArity) in Permutations(totalArity: 
maxArity) { print( @@ -393,6 +400,60 @@ struct VariadicsGenerator: ParsableCommand { """) } + + + func emitAtomicGroup(arity: Int) { + assert(arity >= 0) + let groupName = "BacktrackingScope" + func node(builder: Bool) -> String { + """ + .nonCapturingGroup(.atomicNonCapturing, component\( + builder ? "()" : "" + ).regex.root) + """ + } + + let disfavored = arity == 0 ? "@_disfavoredOverload\n" : "" + let genericParams: String = { + var result = "" + if arity > 0 { + result += "W" + result += (0..( + _ component: Component + ) \(whereClauseForInit) { + self.init(node: \(node(builder: false))) + } + } + + extension \(groupName) { + \(disfavored)\ + public init<\(genericParams)>( + @\(concatBuilderName) _ component: () -> Component + ) \(whereClauseForInit) { + self.init(node: \(node(builder: true))) + } + } + + """) + } + func emitRepeating(arity: Int) { assert(arity >= 0) From 692237f84af785ccb7156b68b2ae6a8a18fe4909 Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Thu, 31 Mar 2022 14:40:55 -0600 Subject: [PATCH 07/17] Rename BacktrackingScope to Local (#239) --- Sources/RegexBuilder/DSL.swift | 2 +- Sources/RegexBuilder/Variadics.swift | 44 +++++++++---------- .../VariadicsGenerator.swift | 2 +- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/Sources/RegexBuilder/DSL.swift b/Sources/RegexBuilder/DSL.swift index 632f1baba..80662be41 100644 --- a/Sources/RegexBuilder/DSL.swift +++ b/Sources/RegexBuilder/DSL.swift @@ -239,7 +239,7 @@ public struct TryCapture: _BuiltinRegexComponent { /// An atomic group, i.e. 
opens a local backtracking scope which, upon successful exit, /// discards any remaining backtracking points from within the scope -public struct BacktrackingScope: _BuiltinRegexComponent { +public struct Local: _BuiltinRegexComponent { public var regex: Regex internal init(_ regex: Regex) { diff --git a/Sources/RegexBuilder/Variadics.swift b/Sources/RegexBuilder/Variadics.swift index 002898dfd..989e5d463 100644 --- a/Sources/RegexBuilder/Variadics.swift +++ b/Sources/RegexBuilder/Variadics.swift @@ -1566,7 +1566,7 @@ extension Repeat { self.init(node: .repeating(expression.relative(to: 0..( _ component: Component @@ -1575,7 +1575,7 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { @_disfavoredOverload public init( @RegexComponentBuilder _ component: () -> Component @@ -1583,7 +1583,7 @@ extension BacktrackingScope { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0), Component.Output == (W, C0) { @@ -1591,14 +1591,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0), Component.Output == (W, C0) { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0, C1), Component.Output == (W, C0, C1) { @@ -1606,14 +1606,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1), Component.Output == (W, C0, C1) { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: 
Component ) where Output == (Substring, C0, C1, C2), Component.Output == (W, C0, C1, C2) { @@ -1621,14 +1621,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1, C2), Component.Output == (W, C0, C1, C2) { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0, C1, C2, C3), Component.Output == (W, C0, C1, C2, C3) { @@ -1636,14 +1636,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1, C2, C3), Component.Output == (W, C0, C1, C2, C3) { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0, C1, C2, C3, C4), Component.Output == (W, C0, C1, C2, C3, C4) { @@ -1651,14 +1651,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1, C2, C3, C4), Component.Output == (W, C0, C1, C2, C3, C4) { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5), Component.Output == (W, C0, C1, C2, C3, C4, C5) { @@ -1666,14 +1666,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5), Component.Output == (W, C0, C1, C2, C3, C4, C5) { self.init(node: 
.nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6) { @@ -1681,14 +1681,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6) { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7) { @@ -1696,14 +1696,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7) { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8) { @@ -1711,14 +1711,14 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8) { self.init(node: .nonCapturingGroup(.atomicNonCapturing, component().regex.root)) } } -extension BacktrackingScope { +extension Local { public init( _ component: Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9), 
Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { @@ -1726,7 +1726,7 @@ extension BacktrackingScope { } } -extension BacktrackingScope { +extension Local { public init( @RegexComponentBuilder _ component: () -> Component ) where Output == (Substring, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9), Component.Output == (W, C0, C1, C2, C3, C4, C5, C6, C7, C8, C9) { diff --git a/Sources/VariadicsGenerator/VariadicsGenerator.swift b/Sources/VariadicsGenerator/VariadicsGenerator.swift index 23a362dad..dbeff818c 100644 --- a/Sources/VariadicsGenerator/VariadicsGenerator.swift +++ b/Sources/VariadicsGenerator/VariadicsGenerator.swift @@ -404,7 +404,7 @@ struct VariadicsGenerator: ParsableCommand { func emitAtomicGroup(arity: Int) { assert(arity >= 0) - let groupName = "BacktrackingScope" + let groupName = "Local" func node(builder: Bool) -> String { """ .nonCapturingGroup(.atomicNonCapturing, component\( From 096d39d4051af8e918c9d8d77554487525ac48d7 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:51 +0100 Subject: [PATCH 08/17] Better filter trivia in dumps Make sure we don't try and print things like empty comma lists `,,,,` or redundant parens for concatenations that had their trivia filtered out. --- .../_RegexParser/Regex/Printing/DumpAST.swift | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 47142407a..0e40ad2ce 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -44,18 +44,23 @@ extension _ASTPrintable { guard let children = _children else { return _dumpBase } - let sub = children.lazy.compactMap { + let childDump = children.compactMap { child -> String? in // Exclude trivia for now, as we don't want it to appear when performing // comparisons of dumped output in tests. 
// TODO: We should eventually have some way of filtering out trivia for // tests, so that it can appear in regular dumps. - if $0.isTrivia { return nil } - return $0._dump() - }.joined(separator: ",") - if sub.isEmpty { - return "\(_dumpBase)" + if child.isTrivia { return nil } + let dump = child._dump() + return !dump.isEmpty ? dump : nil } - return "\(_dumpBase)(\(sub))" + let base = "\(_dumpBase)" + if childDump.isEmpty { + return base + } + if childDump.count == 1, base.isEmpty { + return "\(childDump[0])" + } + return "\(base)(\(childDump.joined(separator: ",")))" } } @@ -77,7 +82,7 @@ extension AST.Node: _ASTPrintable { } extension AST.Alternation { - public var _dumpBase: String { "alternation" } + public var _dumpBase: String { "alternation<\(children.count)>" } } extension AST.Concatenation { From c6dc547908bd3aab852e04c47286a651b31d8c00 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:51 +0100 Subject: [PATCH 09/17] Formalize non-semantic whitespace matching Turns out this is a Unicode-defined thing. --- .../Regex/Parse/LexicalAnalysis.swift | 26 ++++--------------- .../_RegexParser/Utility/MissingUnicode.swift | 6 +++++ 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 4eb0ebea4..b595f3d29 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -550,28 +550,12 @@ extension Source { ) throws -> AST.Trivia? { guard context.ignoreWhitespace else { return nil } - func isWhitespace(_ c: Character) -> Bool { - // This is a list of characters that PCRE treats as whitespace when - // compiled with Unicode support. It is a subset of the characters with - // the `.isWhitespace` property. ICU appears to also follow this list. - // Oniguruma and .NET follow a subset of this list. 
- // - // FIXME: PCRE only treats space and tab characters as whitespace when - // inside a custom character class (and only treats whitespace as - // non-semantic there for the extra-extended `(?xx)` mode). If we get a - // strict-PCRE mode, we'll need to add a case for that. - switch c { - case " ", "\u{9}"..."\u{D}", // space, \t, \n, vertical tab, \f, \r - "\u{85}", "\u{200E}", // next line, left-to-right mark - "\u{200F}", "\u{2028}", // right-to-left-mark, line separator - "\u{2029}": // paragraph separator - return true - default: - return false - } - } + // FIXME: PCRE only treats space and tab characters as whitespace when + // inside a custom character class (and only treats whitespace as + // non-semantic there for the extra-extended `(?xx)` mode). If we get a + // strict-PCRE mode, we'll need to add a case for that. let trivia: Located? = recordLoc { src in - src.tryEatPrefix(isWhitespace)?.string + src.tryEatPrefix(\.isPatternWhitespace)?.string } guard let trivia = trivia else { return nil } return AST.Trivia(trivia) diff --git a/Sources/_RegexParser/Utility/MissingUnicode.swift b/Sources/_RegexParser/Utility/MissingUnicode.swift index dccba3286..4d819806b 100644 --- a/Sources/_RegexParser/Utility/MissingUnicode.swift +++ b/Sources/_RegexParser/Utility/MissingUnicode.swift @@ -660,6 +660,12 @@ extension Character { public var isOctalDigit: Bool { ("0"..."7").contains(self) } public var isWordCharacter: Bool { isLetter || isNumber || self == "_" } + + /// Whether this character represents whitespace for the purposes of pattern + /// parsing. 
+ public var isPatternWhitespace: Bool { + return unicodeScalars.first!.properties.isPatternWhitespace + } } extension UnicodeScalar { From a96648badd28106b4db723aca44b1f83fa956ffe Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:52 +0100 Subject: [PATCH 10/17] Rename endOfString -> unterminated --- .../Regex/Parse/DelimiterLexing.swift | 8 ++++---- Sources/_RegexParser/Regex/Parse/Mocking.swift | 2 +- Tests/RegexTests/ParseTests.swift | 16 ++++++++-------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift index 1227ade1f..e88c1fa80 100644 --- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift @@ -41,7 +41,7 @@ enum Delimiter: Hashable, CaseIterable { struct DelimiterLexError: Error, CustomStringConvertible { enum Kind: Hashable { - case endOfString + case unterminated case invalidUTF8 // TODO: better range reporting case unknownDelimiter case unprintableASCII @@ -59,7 +59,7 @@ struct DelimiterLexError: Error, CustomStringConvertible { var description: String { switch kind { - case .endOfString: return "unterminated regex literal" + case .unterminated: return "unterminated regex literal" case .invalidUTF8: return "invalid UTF-8 found in source file" case .unknownDelimiter: return "unknown regex literal delimiter" case .unprintableASCII: return "unprintable ASCII character found in source file" @@ -238,7 +238,7 @@ fileprivate struct DelimiterLexer { /// the end of the buffer is reached. 
mutating func advance(escaped: Bool = false) throws { guard let next = load() else { - throw DelimiterLexError(.endOfString, resumeAt: cursor) + throw DelimiterLexError(.unterminated, resumeAt: cursor) } switch UnicodeScalar(next) { case let next where !next.isASCII: @@ -249,7 +249,7 @@ fileprivate struct DelimiterLexer { advanceCursor() case "\n", "\r": - throw DelimiterLexError(.endOfString, resumeAt: cursor) + throw DelimiterLexError(.unterminated, resumeAt: cursor) case "\0": // TODO: Warn to match the behavior of String literal lexer? Or should diff --git a/Sources/_RegexParser/Regex/Parse/Mocking.swift b/Sources/_RegexParser/Regex/Parse/Mocking.swift index 5994a4f52..596a59bf4 100644 --- a/Sources/_RegexParser/Regex/Parse/Mocking.swift +++ b/Sources/_RegexParser/Regex/Parse/Mocking.swift @@ -62,7 +62,7 @@ func libswiftLexRegexLiteral( curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self) switch error.kind { - case .endOfString: + case .unterminated: // Missing closing delimiter can be recovered from. return false case .unprintableASCII, .invalidUTF8: diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index f6f31c075..649ea22e2 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2079,21 +2079,21 @@ extension RegexTests { // MARK: Printable ASCII - delimiterLexingDiagnosticTest(#"re'\\#n'"#, .endOfString) + delimiterLexingDiagnosticTest(#"re'\\#n'"#, .unterminated) for i: UInt8 in 0x1 ..< 0x20 where i != 0xA && i != 0xD { // U+A & U+D are \n and \r. 
delimiterLexingDiagnosticTest("re'\(UnicodeScalar(i))'", .unprintableASCII) } - delimiterLexingDiagnosticTest("re'\n'", .endOfString) - delimiterLexingDiagnosticTest("re'\r'", .endOfString) + delimiterLexingDiagnosticTest("re'\n'", .unterminated) + delimiterLexingDiagnosticTest("re'\r'", .unterminated) delimiterLexingDiagnosticTest("re'\u{7F}'", .unprintableASCII) // MARK: Delimiter skipping - delimiterLexingDiagnosticTest("re'(?''", .endOfString) - delimiterLexingDiagnosticTest("re'(?'abc'", .endOfString) - delimiterLexingDiagnosticTest("re'(?('abc'", .endOfString) - delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .endOfString) - delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .endOfString) + delimiterLexingDiagnosticTest("re'(?''", .unterminated) + delimiterLexingDiagnosticTest("re'(?'abc'", .unterminated) + delimiterLexingDiagnosticTest("re'(?('abc'", .unterminated) + delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .unterminated) + delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .unterminated) } func testlibswiftDiagnostics() { From 120ffc90de110ed3e2d1af382cb2f0f093e340da Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:52 +0100 Subject: [PATCH 11/17] Fix end-of-line-comment lexing Previously we would just lex to the end of the input, as it was assumed only single-line regex would be supported. Update the implementation to handle multi-line, and take account of PCRE global options. 
--- .../Regex/AST/MatchingOptions.swift | 3 +- .../Regex/Parse/LexicalAnalysis.swift | 26 ++- Sources/_RegexParser/Regex/Parse/Parse.swift | 13 ++ Sources/_RegexParser/Regex/Parse/Source.swift | 6 + Tests/RegexTests/ParseTests.swift | 208 ++++++++++++++++++ 5 files changed, 250 insertions(+), 6 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift index 25cb10842..808b51287 100644 --- a/Sources/_RegexParser/Regex/AST/MatchingOptions.swift +++ b/Sources/_RegexParser/Regex/AST/MatchingOptions.swift @@ -137,7 +137,8 @@ extension AST { /// Global matching option specifiers. Unlike `MatchingOptionSequence`, /// these must appear at the start of the pattern, and apply globally. public struct GlobalMatchingOption: _ASTNode, Hashable { - /// Determines the definition of a newline for the '.' character class. + /// Determines the definition of a newline for the '.' character class and + /// when parsing end-of-line comments. public enum NewlineMatching: Hashable { /// (*CR*) case carriageReturnOnly diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index b595f3d29..165e97d1a 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -528,11 +528,27 @@ extension Source { return try src.expectQuoted(endingWith: "*/").value } if context.endOfLineComments, src.tryEat("#") { - // TODO: If we ever support multi-line regex literals, this will need - // to be updated to stop at a newline. Note though that PCRE specifies - // that the newline it matches against can be controlled by the global - // matching options e.g `(*CR)`, `(*ANY)`, ... - return src.lexUntil(\.isEmpty).value + // Try eat until we either exhaust the input, or hit a newline. Note + // that the definition of newline can be altered depending on the global + // matching options. 
By default we consider a newline to be `\n` or + // `\r`. + return src.lexUntil { src in + if src.isEmpty { return true } + switch context.newlineMode { + case .carriageReturnOnly: + return src.tryEat("\r") + case .linefeedOnly: + return src.tryEat("\n") + case .carriageAndLinefeedOnly: + return src.tryEat("\r\n") + case .anyCarriageReturnOrLinefeed: + return src.tryEat(anyOf: "\r", "\n", "\r\n") != nil + case .anyUnicode: + return src.tryEat(where: \.isNewline) + case .nulCharacter: + return src.tryEat("\0") + } + }.value } return nil } diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 7867073e6..2512f9bf2 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -76,6 +76,10 @@ struct ParsingContext { /// The syntax options currently set. fileprivate(set) var syntax: SyntaxOptions + /// The current newline matching mode. + fileprivate(set) var newlineMode: AST.GlobalMatchingOption.NewlineMatching + = .anyCarriageReturnOrLinefeed + fileprivate mutating func recordGroup(_ g: AST.Group.Kind) { // TODO: Needs to track group number resets (?|...). priorGroupCount += 1 @@ -139,6 +143,15 @@ extension Parser { // First parse any global matching options if present. let opts = try source.lexGlobalMatchingOptionSequence() + // If we have a newline mode global option, update the context accordingly. + if let opts = opts { + for opt in opts.options.reversed() { + guard case .newlineMatching(let newline) = opt.kind else { continue } + context.newlineMode = newline + break + } + } + // Then parse the root AST node. 
let ast = try parseNode() guard source.isEmpty else { diff --git a/Sources/_RegexParser/Regex/Parse/Source.swift b/Sources/_RegexParser/Regex/Parse/Source.swift index ddf0475f3..6eac16395 100644 --- a/Sources/_RegexParser/Regex/Parse/Source.swift +++ b/Sources/_RegexParser/Regex/Parse/Source.swift @@ -68,6 +68,12 @@ extension Source { return true } + mutating func tryEat(where pred: (Char) throws -> Bool) rethrows -> Bool { + guard let next = peek(), try pred(next) else { return false } + advance() + return true + } + mutating func tryEat(sequence c: C) -> Bool where C.Element == Char { guard _slice.starts(with: c) else { return false } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 649ea22e2..b185234a0 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1526,6 +1526,214 @@ extension RegexTests { matchingOptions(adding: .extended), isIsolated: true, charClass("a", "b")) ) + // Test multi-line comment handling. 
+ parseTest( + """ + # a + bc # d + ef# g + # h + """, + concat("b", "c", "e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + # a\r\ + bc # d\r\ + ef# g\r\ + # h\r + """, + concat("b", "c", "e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + # a\r\ + bc # d\r\ + ef# g\r\ + # h\r + """, + concat("b", "c", "e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + # a\r + bc # d\r + ef# g\r + # h\r + """, + concat("b", "c", "e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + # a\n\r\ + bc # d\n\r\ + ef# g\n\r\ + # h\n\r + """, + concat("b", "c", "e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + (*CR) + # a + bc # d + ef# g + # h + """, + ast(empty(), opts: .newlineMatching(.carriageReturnOnly)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*CR)\r\ + # a\r\ + bc # d\r\ + ef# g\r\ + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageReturnOnly)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*LF) + # a + bc # d + ef# g + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.linefeedOnly)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*CRLF) + # a + bc # d + ef# g + # h + """, + ast(empty(), opts: .newlineMatching(.carriageAndLinefeedOnly)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*CRLF) + # a\r + bc # d\r + ef# g\r + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageAndLinefeedOnly)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*ANYCRLF) + # a + bc # d + ef# g + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*ANYCRLF) + # a\r\ + bc # d\r\ + ef# g\r\ + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*ANYCRLF) + # a\r + bc # d\r + ef# g\r + # h + """, + ast(concat("b", "c", "e", "f"), opts: 
.newlineMatching(.anyCarriageReturnOrLinefeed)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*ANY) + # a + bc # d + ef# g + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), + syntax: .extendedSyntax + ) + parseTest( + """ + # a\u{2028}\ + bc # d + ef# g\u{2028}\ + # h + """, + concat("e", "f"), + syntax: .extendedSyntax + ) + parseTest( + """ + (*ANY) + # a\u{2028}\ + bc # d\u{2028}\ + ef# g\u{2028}\ + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*NUL) + # a + bc # d\0\ + ef# g + # h + """, + ast(concat("e", "f"), opts: .newlineMatching(.nulCharacter)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*NUL) + # a\0\ + bc # d\0\ + ef# g\0\ + # h + """, + ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.nulCharacter)), + syntax: .extendedSyntax + ) + parseTest( + """ + (*CR)(*NUL) + # a\0\ + bc # d\0\ + ef# g\0\ + # h + """, + ast(concat("b", "c", "e", "f"), + opts: .newlineMatching(.carriageReturnOnly), + .newlineMatching(.nulCharacter) + ), + syntax: .extendedSyntax + ) + // MARK: Parse with delimiters parseWithDelimitersTest("#/a b/#", concat("a", " ", "b")) From 4944fbea80d5abbbcc2bc03cc511868aebae949e Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:52 +0100 Subject: [PATCH 12/17] Lex extended pound delimiters Start lexing `/.../`, and allow any number of pound signs to surround it. 
--- .../Regex/Parse/DelimiterLexing.swift | 152 +++++++++++++----- Tests/RegexTests/LexTests.swift | 25 +-- Tests/RegexTests/ParseTests.swift | 12 ++ 3 files changed, 143 insertions(+), 46 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift index e88c1fa80..fa6ca978a 100644 --- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift @@ -11,27 +11,27 @@ // TODO: mock up multi-line soon -enum Delimiter: Hashable, CaseIterable { - case traditional - case experimental - case reSingleQuote - case rxSingleQuote - - var openingAndClosing: (opening: String, closing: String) { - switch self { - case .traditional: return ("#/", "/#") - case .experimental: return ("#|", "|#") - case .reSingleQuote: return ("re'", "'") - case .rxSingleQuote: return ("rx'", "'") - } +struct Delimiter: Hashable { + let kind: Kind + let poundCount: Int + + init(_ kind: Kind, poundCount: Int) { + precondition(kind.allowsExtendedPoundSyntax || poundCount == 0) + self.kind = kind + self.poundCount = poundCount + } + + var opening: String { + String(repeating: "#", count: poundCount) + kind.opening + } + var closing: String { + kind.closing + String(repeating: "#", count: poundCount) } - var opening: String { openingAndClosing.opening } - var closing: String { openingAndClosing.closing } /// The default set of syntax options that the delimiter indicates. 
var defaultSyntaxOptions: SyntaxOptions { - switch self { - case .traditional, .reSingleQuote: + switch kind { + case .forwardSlash, .reSingleQuote: return .traditional case .experimental, .rxSingleQuote: return .experimental @@ -39,6 +39,37 @@ enum Delimiter: Hashable, CaseIterable { } } +extension Delimiter { + enum Kind: Hashable, CaseIterable { + case forwardSlash + case experimental + case reSingleQuote + case rxSingleQuote + + var openingAndClosing: (opening: String, closing: String) { + switch self { + case .forwardSlash: return ("/", "/") + case .experimental: return ("#|", "|#") + case .reSingleQuote: return ("re'", "'") + case .rxSingleQuote: return ("rx'", "'") + } + } + var opening: String { openingAndClosing.opening } + var closing: String { openingAndClosing.closing } + + /// Whether or not extended pound syntax e.g `##/.../##` is allowed with + /// this delimiter. + var allowsExtendedPoundSyntax: Bool { + switch self { + case .forwardSlash: + return true + case .experimental, .reSingleQuote, .rxSingleQuote: + return false + } + } + } +} + struct DelimiterLexError: Error, CustomStringConvertible { enum Kind: Hashable { case unterminated @@ -120,16 +151,25 @@ fileprivate struct DelimiterLexer { precondition(cursor <= end, "Cannot advance past end") } - /// Check to see if a UTF-8 sequence can be eaten from the current cursor. - func canEat(_ utf8: String.UTF8View) -> Bool { - guard let slice = slice(utf8.count) else { return false } - return slice.elementsEqual(utf8) + /// Check to see if a byte sequence can be eaten from the current cursor. + func canEat(_ bytes: C) -> Bool where C.Element == UInt8 { + guard let slice = slice(bytes.count) else { return false } + return slice.elementsEqual(bytes) + } + + /// Attempt to eat a byte sequence, returning `true` if successful. 
+ mutating func tryEat( + _ bytes: C + ) -> Bool where C.Element == UInt8 { + guard canEat(bytes) else { return false } + advanceCursor(bytes.count) + return true } - /// Attempt to eat a UTF-8 byte sequence, returning `true` if successful. - mutating func tryEat(_ utf8: String.UTF8View) -> Bool { - guard canEat(utf8) else { return false } - advanceCursor(utf8.count) + /// Attempt to eat an ascii scalar, returning `true` if successful. + mutating func tryEat(ascii s: Unicode.Scalar) -> Bool { + guard load() == ascii(s) else { return false } + advanceCursor() return true } @@ -137,8 +177,8 @@ fileprivate struct DelimiterLexer { /// the actual closing delimiter. mutating func trySkipDelimiter(_ delimiter: Delimiter) { // Only the closing `'` for re'...'/rx'...' can potentially be skipped over. - switch delimiter { - case .traditional, .experimental: + switch delimiter.kind { + case .forwardSlash, .experimental: return case .reSingleQuote, .rxSingleQuote: break @@ -272,16 +312,42 @@ fileprivate struct DelimiterLexer { } } + mutating func tryLexOpeningDelimiter(poundCount: Int) -> Delimiter? { + for kind in Delimiter.Kind.allCases { + // If the delimiter allows extended pound syntax, or there are no pounds, + // we just need to lex it. + let opening = kind.opening.utf8 + if kind.allowsExtendedPoundSyntax || poundCount == 0 { + guard tryEat(opening) else { continue } + return Delimiter(kind, poundCount: poundCount) + } + + // The delimiter doesn't allow extended pound syntax, so the pounds must be + // part of the delimiter. + guard + poundCount < opening.count, + opening.prefix(poundCount) + .elementsEqual(repeatElement(ascii("#"), count: poundCount)), + tryEat(opening.dropFirst(poundCount)) + else { continue } + + return Delimiter(kind, poundCount: 0) + } + return nil + } + /*consuming*/ mutating func lex( ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { + // We can consume any number of pound signs. 
+ var poundCount = 0 + while tryEat(ascii: "#") { + poundCount += 1 + } // Try to lex the opening delimiter. - guard let delimiter = Delimiter.allCases.first( - where: { tryEat($0.opening.utf8) } - ) else { + guard let delimiter = tryLexOpeningDelimiter(poundCount: poundCount) else { throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor()) } - let contentsStart = cursor while true { // Check to see if we're at a character that looks like a delimiter, but @@ -302,20 +368,34 @@ fileprivate struct DelimiterLexer { /// Drop a set of regex delimiters from the input string, returning the contents /// and the delimiter used. The input string must have valid delimiters. func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { - func stripDelimiter(_ delim: Delimiter) -> String? { + func stripDelimiter(_ kind: Delimiter.Kind) -> (String, Delimiter)? { + var slice = str.utf8[...] + + // Try strip any number of opening '#'s. + var poundCount = 0 + if kind.allowsExtendedPoundSyntax { + poundCount = slice.prefix(while: { + $0 == UInt8(("#" as UnicodeScalar).value) + }).count + slice = slice.dropFirst(poundCount) + } + // The opening delimiter must match. - guard var slice = str.utf8.tryDropPrefix(delim.opening.utf8) + guard var slice = slice.tryDropPrefix(kind.opening.utf8) else { return nil } // The closing delimiter may optionally match, as it may not be present in // invalid code. 
+ let delim = Delimiter(kind, poundCount: poundCount) if let newSlice = slice.tryDropSuffix(delim.closing.utf8) { slice = newSlice } - return String(slice) + let result = String(decoding: slice, as: UTF8.self) + precondition(result.utf8.elementsEqual(slice)) + return (result, delim) } - for d in Delimiter.allCases { - if let contents = stripDelimiter(d) { + for kind in Delimiter.Kind.allCases { + if let (contents, d) = stripDelimiter(kind) { return (contents, d) } } diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index c50191d05..d11be6c34 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -101,26 +101,31 @@ extension RegexTests { func testCompilerInterface() { + func delim(_ kind: Delimiter.Kind, poundCount: Int = 0) -> Delimiter { + Delimiter(kind, poundCount: poundCount) + } let testCases: [(String, (String, Delimiter)?)] = [ - ("#/abc/#", ("abc", .traditional)), - ("#|abc|#", ("abc", .experimental)), + ("/abc/", ("abc", delim(.forwardSlash))), + ("#/abc/#", ("abc", delim(.forwardSlash, poundCount: 1))), + ("###/abc/###", ("abc", delim(.forwardSlash, poundCount: 3))), + ("#|abc|#", ("abc", delim(.experimental))), // TODO: Null characters are lexically valid, similar to string literals, // but we ought to warn the user about them. 
- ("#|ab\0c|#", ("ab\0c", .experimental)), + ("#|ab\0c|#", ("ab\0c", delim(.experimental))), ("'abc'", nil), - ("#/abc/def/#", ("abc/def", .traditional)), - ("#|abc|def|#", ("abc|def", .experimental)), - ("#/abc\\/#def/#", ("abc\\/#def", .traditional)), - ("#|abc\\|#def|#", ("abc\\|#def", .experimental)), - ("#/abc|#def/#", ("abc|#def", .traditional)), - ("#|abc/#def|#", ("abc/#def", .experimental)), + ("#/abc/def/#", ("abc/def", delim(.forwardSlash, poundCount: 1))), + ("#|abc|def|#", ("abc|def", delim(.experimental))), + ("#/abc\\/#def/#", ("abc\\/#def", delim(.forwardSlash, poundCount: 1))), + ("#|abc\\|#def|#", ("abc\\|#def", delim(.experimental))), + ("#/abc|#def/#", ("abc|#def", delim(.forwardSlash, poundCount: 1))), + ("#|abc/#def|#", ("abc/#def", delim(.experimental))), ("#/abc|#def/", nil), ("#|abc/#def#", nil), ("#/abc\n/#", nil), ("#/abc\r/#", nil), - (#"re'abcre\''"#, (#"abcre\'"#, .reSingleQuote)), + (#"re'abcre\''"#, (#"abcre\'"#, delim(.reSingleQuote))), (#"re'\'"#, nil) ] diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index b185234a0..c4f13ffd9 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1736,7 +1736,9 @@ extension RegexTests { // MARK: Parse with delimiters + parseWithDelimitersTest("/a b/", concat("a", " ", "b")) parseWithDelimitersTest("#/a b/#", concat("a", " ", "b")) + parseWithDelimitersTest("##/a b/##", concat("a", " ", "b")) parseWithDelimitersTest("#|a b|#", concat("a", "b")) parseWithDelimitersTest("re'a b'", concat("a", " ", "b")) @@ -1773,6 +1775,11 @@ extension RegexTests { // Printable ASCII characters. delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##) + // Make sure we can handle a combining accent as first character. 
+ parseWithDelimitersTest("/\u{301}/", "\u{301}") + + delimiterLexingTest("/a/#", ignoreTrailing: true) + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter // if it's clear that it's part of the regex syntax. @@ -2302,6 +2309,11 @@ extension RegexTests { delimiterLexingDiagnosticTest("re'(?('abc'", .unterminated) delimiterLexingDiagnosticTest(#"re'\k'ab_c0+-'"#, .unterminated) delimiterLexingDiagnosticTest(#"re'\g'ab_c0+-'"#, .unterminated) + + // MARK: Unbalanced extended syntax + delimiterLexingDiagnosticTest("#/a/", .unterminated) + delimiterLexingDiagnosticTest("##/a/#", .unterminated) + } func testlibswiftDiagnostics() { From 9f42ea4ce07194030e63ec104438a0bf4d9e12bd Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:53 +0100 Subject: [PATCH 13/17] Introduce a multi-line literal mode When an extended delimiter `#/` is followed by a newline, enter a multi-line mode where the literal may span multiple lines, and extended syntax is enabled by default. 
--- .../Regex/Parse/DelimiterLexing.swift | 67 +++++++-- .../Regex/Parse/Diagnostics.swift | 4 + .../Regex/Parse/LexicalAnalysis.swift | 8 +- .../_RegexParser/Regex/Parse/Mocking.swift | 4 +- Sources/_RegexParser/Regex/Parse/Parse.swift | 46 +++++-- .../Regex/Parse/SyntaxOptions.swift | 5 + Tests/RegexTests/LexTests.swift | 5 + Tests/RegexTests/ParseTests.swift | 127 ++++++++++++++++++ 8 files changed, 239 insertions(+), 27 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift index fa6ca978a..a9f92ade3 100644 --- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift @@ -9,8 +9,6 @@ // //===----------------------------------------------------------------------===// -// TODO: mock up multi-line soon - struct Delimiter: Hashable { let kind: Kind let poundCount: Int @@ -28,13 +26,13 @@ struct Delimiter: Hashable { kind.closing + String(repeating: "#", count: poundCount) } - /// The default set of syntax options that the delimiter indicates. - var defaultSyntaxOptions: SyntaxOptions { + /// Whether or not multi-line mode is permitted. 
+ var allowsMultiline: Bool { switch kind { - case .forwardSlash, .reSingleQuote: - return .traditional - case .experimental, .rxSingleQuote: - return .experimental + case .forwardSlash: + return poundCount > 0 + case .experimental, .reSingleQuote, .rxSingleQuote: + return false } } } @@ -76,6 +74,7 @@ struct DelimiterLexError: Error, CustomStringConvertible { case invalidUTF8 // TODO: better range reporting case unknownDelimiter case unprintableASCII + case multilineClosingNotOnNewline } var kind: Kind @@ -94,6 +93,7 @@ struct DelimiterLexError: Error, CustomStringConvertible { case .invalidUTF8: return "invalid UTF-8 found in source file" case .unknownDelimiter: return "unknown regex literal delimiter" case .unprintableASCII: return "unprintable ASCII character found in source file" + case .multilineClosingNotOnNewline: return "closing delimiter must appear on new line" } } } @@ -103,6 +103,9 @@ fileprivate struct DelimiterLexer { var cursor: UnsafeRawPointer let end: UnsafeRawPointer + var firstNewline: UnsafeRawPointer? + var isMultiline: Bool { firstNewline != nil } + init(start: UnsafeRawPointer, end: UnsafeRawPointer) { precondition(start <= end) self.start = start @@ -262,12 +265,23 @@ fileprivate struct DelimiterLexer { let contentsEnd = cursor guard tryEat(delimiter.closing.utf8) else { return nil } - // Form a string from the contents and make sure it's valid UTF-8. let count = contentsEnd - contentsStart let contents = UnsafeRawBufferPointer( start: contentsStart, count: count) - let s = String(decoding: contents, as: UTF8.self) + // In multi-line mode, we must be on a new line. So scan backwards and make + // sure we only have whitespace until the newline. + if isMultiline { + let idx = contents.lastIndex( + where: { $0 == ascii("\n") || $0 == ascii("\r") })! 
+ 1 + guard contents[idx...].all({ $0 == ascii(" ") || $0 == ascii("\t") }) + else { + throw DelimiterLexError(.multilineClosingNotOnNewline, resumeAt: cursor) + } + } + + // Form a string from the contents and make sure it's valid UTF-8. + let s = String(decoding: contents, as: UTF8.self) guard s.utf8.elementsEqual(contents) else { throw DelimiterLexError(.invalidUTF8, resumeAt: cursor) } @@ -278,7 +292,10 @@ fileprivate struct DelimiterLexer { /// the end of the buffer is reached. mutating func advance(escaped: Bool = false) throws { guard let next = load() else { - throw DelimiterLexError(.unterminated, resumeAt: cursor) + // We've hit the end of the buffer. In multi-line mode, we don't want to + // skip over what is likely otherwise valid Swift code, so resume from the + // first newline. + throw DelimiterLexError(.unterminated, resumeAt: firstNewline ?? cursor) } switch UnicodeScalar(next) { case let next where !next.isASCII: @@ -289,7 +306,10 @@ fileprivate struct DelimiterLexer { advanceCursor() case "\n", "\r": - throw DelimiterLexError(.unterminated, resumeAt: cursor) + guard isMultiline else { + throw DelimiterLexError(.unterminated, resumeAt: cursor) + } + advanceCursor() case "\0": // TODO: Warn to match the behavior of String literal lexer? Or should @@ -301,8 +321,12 @@ fileprivate struct DelimiterLexer { advanceCursor() try advance(escaped: true) - case let next where !next.isPrintableASCII: + case let next + where !next.isPrintableASCII && !(isMultiline && next == "\t"): // Diagnose unprintable ASCII. + // Note that tabs are allowed in multi-line literals. + // TODO: This matches the string literal behavior, but should we allow + // tabs for single-line regex literals too? // TODO: Ideally we would recover and continue to lex until the ending // delimiter. 
throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor()) @@ -349,6 +373,23 @@ fileprivate struct DelimiterLexer { throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor()) } let contentsStart = cursor + + // If the delimiter allows multi-line, try skipping over any whitespace to a + // newline character. If we can do that, we enter multi-line mode. + if delimiter.allowsMultiline { + while let next = load() { + switch next { + case ascii(" "), ascii("\t"): + advanceCursor() + continue + case ascii("\n"), ascii("\r"): + firstNewline = cursor + default: + break + } + break + } + } while true { // Check to see if we're at a character that looks like a delimiter, but // likely isn't. In such a case, we can attempt to skip over it. diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index d4c809045..621d6ea11 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -70,6 +70,8 @@ enum ParseError: Error, Hashable { case cannotRemoveTextSegmentOptions case cannotRemoveSemanticsOptions + case cannotRemoveExtendedSyntaxInMultilineMode + case expectedCalloutArgument } @@ -158,6 +160,8 @@ extension ParseError: CustomStringConvertible { return "text segment mode cannot be unset, only changed" case .cannotRemoveSemanticsOptions: return "semantic level cannot be unset, only changed" + case .cannotRemoveExtendedSyntaxInMultilineMode: + return "extended syntax may not be disabled in multi-line mode" case .expectedCalloutArgument: return "expected argument to callout" } diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 165e97d1a..c48d53de9 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -657,6 +657,7 @@ extension Source { /// | MatchingOption* '-' 
MatchingOption* /// mutating func lexMatchingOptionSequence( + context: ParsingContext ) throws -> AST.MatchingOptionSequence? { let ateCaret = recordLoc { $0.tryEat("^") } @@ -691,6 +692,11 @@ extension Source { if opt.isSemanticMatchingLevel { throw ParseError.cannotRemoveSemanticsOptions } + // Extended syntax may not be removed if in multi-line mode. + if context.syntax.contains(.multilineExtendedSyntax) && + opt.isAnyExtended { + throw ParseError.cannotRemoveExtendedSyntaxInMultilineMode + } removing.append(opt) } return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location, @@ -864,7 +870,7 @@ extension Source { } // Matching option changing group (?iJmnsUxxxDPSWy{..}-iJmnsUxxxDPSW:). - if let seq = try src.lexMatchingOptionSequence() { + if let seq = try src.lexMatchingOptionSequence(context: context) { if src.tryEat(":") { return .changeMatchingOptions(seq, isIsolated: false) } diff --git a/Sources/_RegexParser/Regex/Parse/Mocking.swift b/Sources/_RegexParser/Regex/Parse/Mocking.swift index 596a59bf4..dd02e0fc7 100644 --- a/Sources/_RegexParser/Regex/Parse/Mocking.swift +++ b/Sources/_RegexParser/Regex/Parse/Mocking.swift @@ -62,8 +62,8 @@ func libswiftLexRegexLiteral( curPtrPtr.pointee = error.resumePtr.assumingMemoryBound(to: CChar.self) switch error.kind { - case .unterminated: - // Missing closing delimiter can be recovered from. + case .unterminated, .multilineClosingNotOnNewline: + // These can be recovered from. return false case .unprintableASCII, .invalidUTF8: // We don't currently have good recovery behavior for these. diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift index 2512f9bf2..c3aa3500b 100644 --- a/Sources/_RegexParser/Regex/Parse/Parse.swift +++ b/Sources/_RegexParser/Regex/Parse/Parse.swift @@ -288,22 +288,25 @@ extension Parser { ) throws -> AST.Group { context.recordGroup(kind.value) - // Check if we're introducing or removing extended syntax. 
+ // Check if we're introducing or removing extended syntax. We skip this for + // multi-line, as extended syntax is always enabled there. // TODO: PCRE differentiates between (?x) and (?xx) where only the latter // handles non-semantic whitespace in a custom character class. Other // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore, // treat (?x) and (?xx) as the same option here. If we ever get a strict // PCRE mode, we will need to change this to handle that. let currentSyntax = context.syntax - if case .changeMatchingOptions(let c, isIsolated: _) = kind.value { - if c.resetsCurrentOptions { - context.syntax.remove(.extendedSyntax) - } - if c.adding.contains(where: \.isAnyExtended) { - context.syntax.insert(.extendedSyntax) - } - if c.removing.contains(where: \.isAnyExtended) { - context.syntax.remove(.extendedSyntax) + if !context.syntax.contains(.multilineExtendedSyntax) { + if case .changeMatchingOptions(let c, isIsolated: _) = kind.value { + if c.resetsCurrentOptions { + context.syntax.remove(.extendedSyntax) + } + if c.adding.contains(where: \.isAnyExtended) { + context.syntax.insert(.extendedSyntax) + } + if c.removing.contains(where: \.isAnyExtended) { + context.syntax.remove(.extendedSyntax) + } } } defer { @@ -532,11 +535,32 @@ public func parse( return try parser.parse() } +/// Retrieve the default set of syntax options that a delimiter and literal +/// contents indicates. +fileprivate func defaultSyntaxOptions( + _ delim: Delimiter, contents: String +) -> SyntaxOptions { + switch delim.kind { + case .forwardSlash: + // For an extended syntax forward slash e.g #/.../#, extended syntax is + // permitted if it spans multiple lines. 
+ if delim.poundCount > 0 && + contents.unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) { + return .multilineExtendedSyntax + } + return .traditional + case .reSingleQuote: + return .traditional + case .experimental, .rxSingleQuote: + return .experimental + } +} + /// Parse a given regex string with delimiters, inferring the syntax options /// from the delimiter used. public func parseWithDelimiters( _ regex: S ) throws -> AST where S.SubSequence == Substring { let (contents, delim) = droppingRegexDelimiters(String(regex)) - return try parse(contents, delim.defaultSyntaxOptions) + return try parse(contents, defaultSyntaxOptions(delim, contents: contents)) } diff --git a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift index 5135d8ec1..b7c09ea1c 100644 --- a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift +++ b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift @@ -58,6 +58,11 @@ public struct SyntaxOptions: OptionSet { /// `(_: .*)` == `(?:.*)` public static var experimentalCaptures: Self { Self(1 << 5) } + /// The default syntax for a multi-line regex literal. + public static var multilineExtendedSyntax: Self { + return [Self(1 << 6), .extendedSyntax] + } + /* /// `*` == `[[:digit:]]*` == `\d*` diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index d11be6c34..5c304fe58 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -110,6 +110,11 @@ extension RegexTests { ("###/abc/###", ("abc", delim(.forwardSlash, poundCount: 3))), ("#|abc|#", ("abc", delim(.experimental))), + // Multiline + ("#/\na\nb\n/#", ("\na\nb\n", delim(.forwardSlash, poundCount: 1))), + ("#/ \na\nb\n /#", (" \na\nb\n ", delim(.forwardSlash, poundCount: 1))), + ("##/ \na\nb\n /##", (" \na\nb\n ", delim(.forwardSlash, poundCount: 2))), + // TODO: Null characters are lexically valid, similar to string literals, // but we ought to warn the user about them. 
("#|ab\0c|#", ("ab\0c", delim(.experimental))), diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index c4f13ffd9..c40cb86ca 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -223,6 +223,36 @@ func diagnosticTest( } } +func diagnosticWithDelimitersTest( + _ input: String, _ expected: ParseError, ignoreTrailing: Bool = false, + file: StaticString = #file, line: UInt = #line +) { + // First try lexing. + let literal = delimiterLexingTest( + input, ignoreTrailing: ignoreTrailing, file: file, line: line) + + do { + let orig = try parseWithDelimiters(literal) + let ast = orig.root + XCTFail(""" + + Passed \(ast) + But expected error: \(expected) + """, file: file, line: line) + } catch let e as Source.LocatedError { + guard e.error == expected else { + XCTFail(""" + + Expected: \(expected) + Actual: \(e.error) + """, file: file, line: line) + return + } + } catch let e { + XCTFail("Error without source location: \(e)", file: file, line: line) + } +} + func delimiterLexingDiagnosticTest( _ input: String, _ expected: DelimiterLexError.Kind, syntax: SyntaxOptions = .traditional, @@ -1403,6 +1433,18 @@ extension RegexTests { parseTest("(?xx) \\ ", changeMatchingOptions(matchingOptions( adding: .extraExtended), isIsolated: true, concat(" "))) + parseTest( + "(?x) a (?^) b", + changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, + concat( + "a", + changeMatchingOptions( + unsetMatchingOptions(), isIsolated: true, concat(" ", "b")) + ) + ) + ) + // End of line comments aren't applicable in custom char classes. // TODO: ICU supports this. 
parseTest( @@ -1780,6 +1822,56 @@ extension RegexTests { delimiterLexingTest("/a/#", ignoreTrailing: true) + // MARK: Multiline + + parseWithDelimitersTest("#/\n/#", empty()) + parseWithDelimitersTest("#/\r/#", empty()) + parseWithDelimitersTest("#/\r\n/#", empty()) + parseWithDelimitersTest("#/\n\t\t /#", empty()) + parseWithDelimitersTest("#/ \t\t\n\t\t /#", empty()) + + parseWithDelimitersTest("#/\n a \n/#", "a") + parseWithDelimitersTest("#/\r a \r/#", "a") + parseWithDelimitersTest("#/\r\n a \r\n/#", "a") + parseWithDelimitersTest("#/\n a \n\t\t /#", "a") + parseWithDelimitersTest("#/\t \n a \n\t\t /#", "a") + + parseWithDelimitersTest(""" + #/ + a + b + c + /# + """, concat("a", "b", "c")) + + parseWithDelimitersTest(""" + #/ + a # comment + b # another + # + /# + """, concat("a", "b")) + + // Make sure (?^) is ignored. + parseWithDelimitersTest(""" + #/ + (?^) + # comment + /# + """, changeMatchingOptions( + unsetMatchingOptions(), isIsolated: true, empty()) + ) + + // (?x) has no effect. + parseWithDelimitersTest(""" + #/ + (?x) + # comment + /# + """, changeMatchingOptions( + matchingOptions(adding: .extended), isIsolated: true, empty()) + ) + // MARK: Delimiter skipping: Make sure we can skip over the ending delimiter // if it's clear that it's part of the regex syntax. @@ -2162,6 +2254,32 @@ extension RegexTests { diagnosticTest("(?-u)", .cannotRemoveSemanticsOptions) diagnosticTest("(?-b)", .cannotRemoveSemanticsOptions) + // Extended syntax may not be removed in multi-line mode. 
+ diagnosticWithDelimitersTest(""" + #/ + (?-x)a b + /# + """, .cannotRemoveExtendedSyntaxInMultilineMode + ) + diagnosticWithDelimitersTest(""" + #/ + (?-xx)a b + /# + """, .cannotRemoveExtendedSyntaxInMultilineMode + ) + diagnosticWithDelimitersTest(""" + #/ + (?-x:a b) + /# + """, .cannotRemoveExtendedSyntaxInMultilineMode + ) + diagnosticWithDelimitersTest(""" + #/ + (?-xx:a b) + /# + """, .cannotRemoveExtendedSyntaxInMultilineMode + ) + // MARK: Group specifiers diagnosticTest(#"(*"#, .unknownGroupKind("*")) @@ -2314,6 +2432,15 @@ extension RegexTests { delimiterLexingDiagnosticTest("#/a/", .unterminated) delimiterLexingDiagnosticTest("##/a/#", .unterminated) + // MARK: Multiline + + // Can only be done if pound signs are used. + delimiterLexingDiagnosticTest("/\n/", .unterminated) + + // Opening and closing delimiters must be on a newline. + delimiterLexingDiagnosticTest("#/a\n/#", .unterminated) + delimiterLexingDiagnosticTest("#/\na/#", .multilineClosingNotOnNewline) + delimiterLexingDiagnosticTest("#/\n#/#", .multilineClosingNotOnNewline) } func testlibswiftDiagnostics() { From 556bca0abb2bd1623664481f9aa31be0ed19af1f Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 14:34:53 +0100 Subject: [PATCH 14/17] Disable unused delimiters Leave only `/.../` (and its extended syntax) enabled for now. --- .../Regex/Parse/DelimiterLexing.swift | 21 ++++++++++++---- Tests/RegexTests/LexTests.swift | 24 +++++++++++++++++-- Tests/RegexTests/ParseTests.swift | 6 +++-- 3 files changed, 42 insertions(+), 9 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift index a9f92ade3..bee782043 100644 --- a/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift +++ b/Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift @@ -35,6 +35,12 @@ struct Delimiter: Hashable { return false } } + + /// The delimiters which are currently enabled. 
+ static var enabledDelimiters: [Kind] { [.forwardSlash] } + + /// All known delimiters. + static var allDelimiters: [Kind] { Kind.allCases } } extension Delimiter { @@ -106,11 +112,15 @@ fileprivate struct DelimiterLexer { var firstNewline: UnsafeRawPointer? var isMultiline: Bool { firstNewline != nil } - init(start: UnsafeRawPointer, end: UnsafeRawPointer) { + let delimiters: [Delimiter.Kind] + + init(start: UnsafeRawPointer, end: UnsafeRawPointer, + delimiters: [Delimiter.Kind]) { precondition(start <= end) self.start = start self.cursor = start self.end = end + self.delimiters = delimiters } func ascii(_ s: Unicode.Scalar) -> UInt8 { @@ -337,7 +347,7 @@ fileprivate struct DelimiterLexer { } mutating func tryLexOpeningDelimiter(poundCount: Int) -> Delimiter? { - for kind in Delimiter.Kind.allCases { + for kind in delimiters { // If the delimiter allows extended pound syntax, or there are no pounds, // we just need to lex it. let opening = kind.opening.utf8 @@ -435,7 +445,7 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { precondition(result.utf8.elementsEqual(slice)) return (result, delim) } - for kind in Delimiter.Kind.allCases { + for kind in Delimiter.allDelimiters { if let (contents, d) = stripDelimiter(kind) { return (contents, d) } @@ -446,8 +456,9 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) { /// Attempt to lex a regex literal between `start` and `end`, returning either /// the contents and pointer from which to resume lexing, or an error. 
func lexRegex( - start: UnsafeRawPointer, end: UnsafeRawPointer + start: UnsafeRawPointer, end: UnsafeRawPointer, + delimiters: [Delimiter.Kind] = Delimiter.enabledDelimiters ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) { - var lexer = DelimiterLexer(start: start, end: end) + var lexer = DelimiterLexer(start: start, end: end, delimiters: delimiters) return try lexer.lex() } diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift index 5c304fe58..958c53c26 100644 --- a/Tests/RegexTests/LexTests.swift +++ b/Tests/RegexTests/LexTests.swift @@ -100,7 +100,7 @@ extension RegexTests { } - func testCompilerInterface() { + func testCompilerInterface() throws { func delim(_ kind: Delimiter.Kind, poundCount: Int = 0) -> Delimiter { Delimiter(kind, poundCount: poundCount) } @@ -138,7 +138,9 @@ extension RegexTests { input.withCString { let endPtr = $0 + input.utf8.count assert(endPtr.pointee == 0) - guard let out = try? lexRegex(start: $0, end: endPtr) else { + guard let out = try? lexRegex( + start: $0, end: endPtr, delimiters: Delimiter.allDelimiters) + else { XCTAssertNil(expected) return } @@ -150,5 +152,23 @@ extension RegexTests { XCTAssertEqual(expected?.1, droppedDelimiters.1) } } + + // TODO: Remove the lexing code for these if we no longer need them. 
+ let disabledDelimiters: [String] = [ + "#|x|#", "re'x'", "rx'y'" + ] + + for input in disabledDelimiters { + try input.withCString { + let endPtr = $0 + input.utf8.count + assert(endPtr.pointee == 0) + do { + _ = try lexRegex(start: $0, end: endPtr) + XCTFail() + } catch let e as DelimiterLexError { + XCTAssertEqual(e.kind, .unknownDelimiter) + } + } + } } } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index c40cb86ca..c6ff3e46d 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -117,7 +117,8 @@ func delimiterLexingTest( ) -> String { input.withCString(encodedAs: UTF8.self) { ptr in let endPtr = ptr + input.utf8.count - let (contents, delim, end) = try! lexRegex(start: ptr, end: endPtr) + let (contents, delim, end) = try! lexRegex( + start: ptr, end: endPtr, delimiters: Delimiter.allDelimiters) if ignoreTrailing { XCTAssertNotEqual(end, endPtr, file: file, line: line) } else { @@ -260,7 +261,8 @@ func delimiterLexingDiagnosticTest( ) { do { _ = try input.withCString { ptr in - try lexRegex(start: ptr, end: ptr + input.count) + try lexRegex( + start: ptr, end: ptr + input.count, delimiters: Delimiter.allDelimiters) } XCTFail(""" Passed, but expected error: \(expected) From 820ab38300b6eff0d52e569612cc35ee8849741d Mon Sep 17 00:00:00 2001 From: Michael Ilseman Date: Fri, 1 Apr 2022 09:12:52 -0600 Subject: [PATCH 15/17] Regex Type and Overview V2 and accompanying tests/changes (#241) * Clarify contractions * Motivation tests, API updates, and text --- Documentation/Evolution/RegexSyntax.md | 33 ++- Documentation/Evolution/RegexTypeOverview.md | 58 ++-- .../Participants/RegexParticipant.swift | 4 +- Sources/RegexBuilder/Match.swift | 20 +- .../Algorithms/Consumers/RegexConsumer.swift | 2 +- .../Regex/AnyRegexOutput.swift | 13 +- Sources/_StringProcessing/Regex/Core.swift | 76 ++--- Sources/_StringProcessing/Regex/Match.swift | 91 +++++- Tests/RegexBuilderTests/CustomTests.swift | 2 +- 
Tests/RegexBuilderTests/MotivationTests.swift | 267 ++++++++++++++++++ Tests/RegexBuilderTests/RegexDSLTests.swift | 64 ++--- Tests/RegexTests/AlgorithmsTests.swift | 8 +- 12 files changed, 524 insertions(+), 114 deletions(-) create mode 100644 Tests/RegexBuilderTests/MotivationTests.swift diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 5bfdd5e8a..958d52f7d 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -2,27 +2,38 @@ Hello, we want to issue an update to [Regular Expression Literals](https://forums.swift.org/t/pitch-regular-expression-literals/52820) and prepare for a formal proposal. The great delimiter deliberation continues to unfold, so in the meantime, we have a significant amount of surface area to present for review/feedback: the syntax _inside_ a regex literal. Additionally, this is the syntax accepted from a string used for run-time regex construction, so we're devoting an entire pitch/proposal to the topic of _regex syntax_, distinct from the result builder DSL or the choice of delimiters for literals. --> -# Regex Syntax +# Run-time Regex Construction -- Authors: Hamish Knight, Michael Ilseman +- Authors: [Hamish Knight](https://github.com/hamishknight), [Michael Ilseman](https://github.com/milseman) ## Introduction -A regex declares a string processing algorithm using syntax familiar across a variety of languages and tools throughout programming history. Regexes can be created from a string at run time or from a literal at compile time. The contents of that run-time string, or the contents in-between the compile-time literal's delimiters, uses regex syntax. We present a detailed and comprehensive treatment of regex syntax. - -This is part of a larger effort in supporting regex literals, which in turn is part of a larger effort towards better string processing using regex. 
See [Pitch and Proposal Status](https://github.com/apple/swift-experimental-string-processing/issues/107), which tracks each relevant piece. This proposal regards _syntactic_ support, and does not necessarily mean that everything that can be written will be supported by Swift's runtime engine in the initial release. Support for more obscure features may appear over time, see [MatchingEngine Capabilities and Roadmap](https://github.com/apple/swift-experimental-string-processing/issues/99) for status. +A regex declares a string processing algorithm using syntax familiar across a variety of languages and tools throughout programming history. We propose the ability to create a regex at run time from a string containing regex syntax (detailed here), API for accessing the match and captures, and a means to convert between an existential capture representation and concrete types. +The overall story is laid out in [Regex Type and Overview](https://github.com/apple/swift-experimental-string-processing/blob/main/Documentation/Evolution/RegexTypeOverview.md) and each individual component is tracked in [Pitch and Proposal Status](https://github.com/apple/swift-experimental-string-processing/issues/107). ## Motivation Swift aims to be a pragmatic programming language, striking a balance between familiarity, interoperability, and advancing the art. Swift's `String` presents a uniquely Unicode-forward model of string, but currently suffers from limited processing facilities. + + The full string processing effort includes a regex type with strongly typed captures, the ability to create a regex from a string at runtime, a compile-time literal, a result builder DSL, protocols for intermixing 3rd party industrial-strength parsers with regex declarations, and a slew of regex-powered algorithms over strings. This proposal specifically hones in on the _familiarity_ aspect by providing a best-in-class treatment of familiar regex syntax. 
## Proposed Solution + + +### Syntax + We propose accepting a syntactic "superset" of the following existing regular expression engines: - [PCRE 2][pcre2-syntax], an "industry standard" and a rough superset of Perl, Python, etc. @@ -40,6 +51,10 @@ Regex syntax will be part of Swift's source-compatibility story as well as its b ## Detailed Design + + We propose the following syntax for regex.
Grammar Notation @@ -832,6 +847,14 @@ Regex syntax will become part of Swift's source and binary-compatibility story, Even though it is more work up-front and creates a longer proposal, it is less risky to support the full intended syntax. The proposed superset maximizes the familiarity benefit of regex syntax. + + [pcre2-syntax]: https://www.pcre.org/current/doc/html/pcre2syntax.html [oniguruma-syntax]: https://github.com/kkos/oniguruma/blob/master/doc/RE [icu-syntax]: https://unicode-org.github.io/icu/userguide/strings/regexp.html diff --git a/Documentation/Evolution/RegexTypeOverview.md b/Documentation/Evolution/RegexTypeOverview.md index 6fe3bc0bf..504111181 100644 --- a/Documentation/Evolution/RegexTypeOverview.md +++ b/Documentation/Evolution/RegexTypeOverview.md @@ -149,11 +149,11 @@ Type mismatches and invalid regex syntax are diagnosed at construction time by ` When the pattern is known at compile time, regexes can be created from a literal containing the same regex syntax, allowing the compiler to infer the output type. Regex literals enable source tools, e.g. syntax highlighting and actions to refactor into a result builder equivalent. ```swift -let regex = re'(\w+)\s\s+(\S+)\s\s+((?:(?!\s\s).)*)\s\s+(.*)' +let regex = /(\w+)\s\s+(\S+)\s\s+((?:(?!\s\s).)*)\s\s+(.*)/ // regex: Regex<(Substring, Substring, Substring, Substring, Substring)> ``` -*Note*: Regex literals, most notably the choice of delimiter, are discussed in [Regex Literals][pitches]. For this example, I used the less technically-problematic option of `re'...'`. +*Note*: Regex literals, most notably the choice of delimiter, are discussed in [Regex Literals][pitches]. This same regex can be created from a result builder, a refactoring-friendly representation: @@ -193,13 +193,13 @@ A `Regex.Match` contains the result of a match, surfacing captures by nu ```swift func processEntry(_ line: String) -> Transaction? 
{ - let regex = re''' - (?x) # Ignore whitespace and comments + // Multiline literal implies `(?x)`, i.e. non-semantic whitespace with line-ending comments + let regex = #/ (? \w+) \s\s+ (? \S+) \s\s+ (? (?: (?!\s\s) . )+) \s\s+ (? .*) - ''' + /# // regex: Regex<( // Substring, // kind: Substring, @@ -291,7 +291,7 @@ A regex describes an algorithm to be ran over some model of string, and Swift's Calling `dropFirst()` will not drop a leading byte or `Unicode.Scalar`, but rather a full `Character`. Similarly, a `.` in a regex will match any extended grapheme cluster. A regex will match canonical equivalents by default, strengthening the connection between regex and the equivalent `String` operations. -Additionally, word boundaries (`\b`) follow [UTS\#29 Word Boundaries](https://www.unicode.org/reports/tr29/#Word_Boundaries), meaning contractions ("don't") and script changes are detected and separated, without incurring significant binary size costs associated with language dictionaries. +Additionally, word boundaries (`\b`) follow [UTS\#29 Word Boundaries](https://www.unicode.org/reports/tr29/#Word_Boundaries). Contractions ("don't") are correctly detected and script changes are separated, without incurring significant binary size costs associated with language dictionaries. Regex targets [UTS\#18 Level 2](https://www.unicode.org/reports/tr18/#Extended_Unicode_Support) by default, but provides options to switch to scalar-level processing as well as compatibility character classes. Detailed rules on how we infer necessary grapheme cluster breaks inside regexes, as well as options and other concerns, are discussed in [Unicode for String Processing][pitches]. @@ -300,18 +300,47 @@ Regex targets [UTS\#18 Level 2](https://www.unicode.org/reports/tr18/#Extended_U ```swift /// A regex represents a string processing algorithm. 
+/// +/// let regex = try Regex(compiling: "a(.*)b") +/// let match = "cbaxb".firstMatch(of: regex) +/// print(match.0) // "axb" +/// print(match.1) // "x" +/// public struct Regex { /// Match a string in its entirety. /// /// Returns `nil` if no match and throws on abort - public func matchWhole(_: String) throws -> Match? + public func matchWhole(_ s: String) throws -> Regex.Match? - /// Match at the front of a string + /// Match part of the string, starting at the beginning. /// /// Returns `nil` if no match and throws on abort - public func matchFront(_: String) throws -> Match? + public func matchPrefix(_ s: String) throws -> Regex.Match? + + /// Find the first match in a string + /// + /// Returns `nil` if no match is found and throws on abort + public func firstMatch(in s: String) throws -> Regex.Match? + + /// Match a substring in its entirety. + /// + /// Returns `nil` if no match and throws on abort + public func matchWhole(_ s: Substring) throws -> Regex.Match? + + /// Match part of the string, starting at the beginning. + /// + /// Returns `nil` if no match and throws on abort + public func matchPrefix(_ s: Substring) throws -> Regex.Match? + + /// Find the first match in a substring + /// + /// Returns `nil` if no match is found and throws on abort + public func firstMatch(_ s: Substring) throws -> Regex.Match? /// The result of matching a regex against a string. + /// + /// A `Match` forwards API to the `Output` generic parameter, + /// providing direct access to captures. 
@dynamicMemberLookup public struct Match { /// The range of the overall match @@ -320,7 +349,7 @@ public struct Regex { /// The produced output from the match operation public var output: Output - /// Lookup a capture by number + /// Lookup a capture by name or number public subscript(dynamicMember keyPath: KeyPath) -> T /// Lookup a capture by number @@ -342,11 +371,6 @@ public struct Regex { extension Regex: RegexComponent { public var regex: Regex { self } - /// Create a regex out of a single component - public init( - _ content: Content - ) where Content.Output == Output - /// Result builder interface public init( @RegexComponentBuilder _ content: () -> Content @@ -360,11 +384,11 @@ extension Regex.Match { // Run-time compilation interfaces extension Regex { - /// Parse and compile `pattern`. + /// Parse and compile `pattern`, resulting in a strongly-typed capture list. public init(compiling pattern: String, as: Output.Type = Output.self) throws } extension Regex where Output == AnyRegexOutput { - /// Parse and compile `pattern`. + /// Parse and compile `pattern`, resulting in an existentially-typed capture list. public init(compiling pattern: String) throws } ``` diff --git a/Sources/Exercises/Participants/RegexParticipant.swift b/Sources/Exercises/Participants/RegexParticipant.swift index 731b9b6f6..a40de3953 100644 --- a/Sources/Exercises/Participants/RegexParticipant.swift +++ b/Sources/Exercises/Participants/RegexParticipant.swift @@ -63,7 +63,7 @@ private func graphemeBreakPropertyData( forLine line: String, using regex: RP ) -> GraphemeBreakEntry? where RP.Output == (Substring, Substring, Substring?, Substring) { - line.match(regex).map(\.output).flatMap(extractFromCaptures) + line.matchWhole(regex).map(\.output).flatMap(extractFromCaptures) } private func graphemeBreakPropertyDataLiteral( @@ -80,7 +80,7 @@ private func graphemeBreakPropertyDataLiteral( private func graphemeBreakPropertyData( forLine line: String ) -> GraphemeBreakEntry? 
{ - line.match { + line.matchWhole { TryCapture(OneOrMore(.hexDigit)) { Unicode.Scalar(hex: $0) } Optionally { ".." diff --git a/Sources/RegexBuilder/Match.swift b/Sources/RegexBuilder/Match.swift index 3f86f9498..ac07ec0b8 100644 --- a/Sources/RegexBuilder/Match.swift +++ b/Sources/RegexBuilder/Match.swift @@ -12,17 +12,29 @@ import _StringProcessing extension String { - public func match( + public func matchWhole( @RegexComponentBuilder _ content: () -> R ) -> Regex.Match? { - match(content()) + matchWhole(content()) + } + + public func matchPrefix( + @RegexComponentBuilder _ content: () -> R + ) -> Regex.Match? { + matchPrefix(content()) } } extension Substring { - public func match( + public func matchWhole( + @RegexComponentBuilder _ content: () -> R + ) -> Regex.Match? { + matchWhole(content()) + } + + public func matchPrefix( @RegexComponentBuilder _ content: () -> R ) -> Regex.Match? { - match(content()) + matchPrefix(content()) } } diff --git a/Sources/_StringProcessing/Algorithms/Consumers/RegexConsumer.swift b/Sources/_StringProcessing/Algorithms/Consumers/RegexConsumer.swift index 4dac3cef5..3ab1e579d 100644 --- a/Sources/_StringProcessing/Algorithms/Consumers/RegexConsumer.swift +++ b/Sources/_StringProcessing/Algorithms/Consumers/RegexConsumer.swift @@ -24,7 +24,7 @@ extension RegexConsumer { func _matchingConsuming( _ consumed: Substring, in range: Range ) -> (upperBound: String.Index, match: Match)? { - guard let result = regex._match( + guard let result = try! 
regex._match( consumed.base, in: range, mode: .partialFromFront ) else { return nil } diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index 2f99470fc..cac0e46c3 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -12,7 +12,18 @@ import _RegexParser extension Regex where Output == AnyRegexOutput { - public init(_ pattern: String) throws { + /// Parse and compile `pattern`, resulting in an existentially-typed capture list. + public init(compiling pattern: String) throws { + self.init(ast: try parse(pattern, .traditional)) + } +} + +extension Regex { + /// Parse and compile `pattern`, resulting in a strongly-typed capture list. + public init( + compiling pattern: String, + as: Output.Type = Output.self + ) throws { self.init(ast: try parse(pattern, .traditional)) } } diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 29d18ef22..265a7868c 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -18,8 +18,48 @@ public protocol RegexComponent { var regex: Regex { get } } -/// A regular expression. +/// A regex represents a string processing algorithm. +/// +/// let regex = try Regex(compiling: "a(.*)b") +/// let match = "cbaxb".firstMatch(of: regex) +/// print(match.0) // "axb" +/// print(match.1) // "x" +/// public struct Regex: RegexComponent { + let program: Program + + var hasCapture: Bool { + program.tree.hasCapture + } + + init(ast: AST) { + self.program = Program(ast: ast) + } + init(ast: AST.Node) { + self.program = Program(ast: .init(ast, globalOptions: nil)) + } + + // Compiler interface. Do not change independently. + @usableFromInline + init(_regexString pattern: String) { + self.init(ast: try! parse(pattern, .traditional)) + } + + // Compiler interface. Do not change independently. 
+ @usableFromInline + init(_regexString pattern: String, version: Int) { + assert(version == currentRegexLiteralFormatVersion) + // The version argument is passed by the compiler using the value defined + // in libswiftParseRegexLiteral. + self.init(ast: try! parseWithDelimiters(pattern)) + } + + public var regex: Regex { + self + } +} + +extension Regex { /// A program representation that caches any lowered representation for /// execution. internal class Program { @@ -41,49 +81,19 @@ public struct Regex: RegexComponent { self.tree = tree } } +} - let program: Program -// var ast: AST { program.ast } - +extension Regex { @_spi(RegexBuilder) public var root: DSLTree.Node { program.tree.root } - var hasCapture: Bool { - program.tree.hasCapture - } - - init(ast: AST) { - self.program = Program(ast: ast) - } - init(ast: AST.Node) { - self.program = Program(ast: .init(ast, globalOptions: nil)) - } - @_spi(RegexBuilder) public init(node: DSLTree.Node) { self.program = Program(tree: .init(node, options: nil)) } - // Compiler interface. Do not change independently. - @usableFromInline - init(_regexString pattern: String) { - self.init(ast: try! parse(pattern, .traditional)) - } - - // Compiler interface. Do not change independently. - @usableFromInline - init(_regexString pattern: String, version: Int) { - assert(version == currentRegexLiteralFormatVersion) - // The version argument is passed by the compiler using the value defined - // in libswiftParseRegexLiteral. - self.init(ast: try! 
parseWithDelimiters(pattern)) - } - - public var regex: Regex { - self - } } // MARK: - Primitive regex components diff --git a/Sources/_StringProcessing/Regex/Match.swift b/Sources/_StringProcessing/Regex/Match.swift index f3776b761..45d33f03e 100644 --- a/Sources/_StringProcessing/Regex/Match.swift +++ b/Sources/_StringProcessing/Regex/Match.swift @@ -10,11 +10,19 @@ //===----------------------------------------------------------------------===// extension Regex { + /// The result of matching a regex against a string. + /// + /// A `Match` forwards API to the `Output` generic parameter, + /// providing direct access to captures. @dynamicMemberLookup public struct Match { let input: String + + /// The range of the overall match public let range: Range + let rawCaptures: [StructuredCapture] + let referencedCaptureOffsets: [ReferenceID: Int] let value: Any? @@ -22,6 +30,7 @@ extension Regex { } extension Regex.Match { + /// The produced output from the match operation public var output: Output { if Output.self == AnyRegexOutput.self { let wholeMatchAsCapture = StructuredCapture( @@ -48,6 +57,7 @@ extension Regex.Match { } } + /// Lookup a capture by name or number public subscript(dynamicMember keyPath: KeyPath) -> T { output[keyPath: keyPath] } @@ -72,36 +82,89 @@ extension Regex.Match { } extension RegexComponent { - public func match(in input: String) -> Regex.Match? { - _match( - input, in: input.startIndex.. Regex.Match? { + try _match(s, in: s.startIndex.. Regex.Match? { + try _match(s, in: s.startIndex.. Regex.Match? { - _match( - input.base, in: input.startIndex.. Regex.Match? { + try _firstMatch(s, in: s.startIndex.. Regex.Match? { + try _match(s.base, in: s.startIndex.. Regex.Match? { + try _match(s.base, in: s.startIndex.. Regex.Match? { + try _firstMatch(s.base, in: s.startIndex.., mode: MatchMode = .wholeString - ) -> Regex.Match? { + ) throws -> Regex.Match? 
{ let executor = Executor(program: regex.program.loweredProgram) - do { return try executor.match(input, in: inputRange, mode) - } catch { - fatalError(String(describing: error)) + } + + func _firstMatch( + _ input: String, + in inputRange: Range + ) throws -> Regex.Match? { + // FIXME: Something more efficient, likely an engine interface, and we + // should scrap the RegexConsumer crap and call this + + var low = inputRange.lowerBound + let high = inputRange.upperBound + while low < high { + if let m = try _match(input, in: low..(_ regex: R) -> Regex.Match? { - regex.match(in: self) + public func matchWhole(_ regex: R) -> Regex.Match? { + try? regex.matchWhole(self) + } + public func matchPrefix(_ regex: R) -> Regex.Match? { + try? regex.matchPrefix(self) } } extension Substring { - public func match(_ regex: R) -> Regex.Match? { - regex.match(in: self) + public func matchWhole(_ regex: R) -> Regex.Match? { + try? regex.matchWhole(self) + } + public func matchPrefix(_ regex: R) -> Regex.Match? { + try? regex.matchPrefix(self) } } diff --git a/Tests/RegexBuilderTests/CustomTests.swift b/Tests/RegexBuilderTests/CustomTests.swift index b405a5399..7be95c28c 100644 --- a/Tests/RegexBuilderTests/CustomTests.swift +++ b/Tests/RegexBuilderTests/CustomTests.swift @@ -62,7 +62,7 @@ func customTest( let result: Match? switch call { case .match: - result = input.match(regex)?.output + result = input.matchWhole(regex)?.output case .firstMatch: result = input.firstMatch(of: regex)?.result } diff --git a/Tests/RegexBuilderTests/MotivationTests.swift b/Tests/RegexBuilderTests/MotivationTests.swift new file mode 100644 index 000000000..882ba6448 --- /dev/null +++ b/Tests/RegexBuilderTests/MotivationTests.swift @@ -0,0 +1,267 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. 
and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +// FIXME: macOS CI seems to be busted and Linux doesn't have FormatStyle +// So, we disable this file for now + +#if false + +import _MatchingEngine + +import XCTest +import _StringProcessing + +import RegexBuilder + +private struct Transaction: Hashable { + enum Kind: Hashable { + case credit + case debit + + init?(_ s: Substring) { + switch s.lowercased() { + case "credit": self = .credit + case "debit": self = .debit + default: return nil + } + } + } + + var kind: Kind + var date: Date + var account: String + var amount: Decimal +} +extension Transaction: CustomStringConvertible { + var description: String { + """ + kind: \(kind) + date: \(date) + account: \(account) + amount: \(amount) + """ + } +} + +private struct Statement { + var entries: [Transaction] + init(_ entries: S) where S.Element == Transaction { + self.entries = Array(entries) + } +} + +// In contrast to unit tests, or small functional tests, these +// test full workloads or perform real(ish) tasks. +// +// TODO: Consider adapting into Exercises or benchmark target... + +private let statement = """ +CREDIT 03/02/2022 Payroll $200.23 +CREDIT 03/03/2022 Sanctioned Individual A $2,000,000.00 +DEBIT 03/03/2022 Totally Legit Shell Corp $2,000,000.00 +DEBIT 03/05/2022 Beanie Babies Are Back $57.33 +""" + +private func processEntry(_ s: String) -> Transaction? { + var slice = s[...] + guard let kindEndIdx = slice.firstIndex(of: " "), + let kind = Transaction.Kind(slice[.. Transaction? { + let nsRegEx = try! NSRegularExpression(pattern: pattern) + + let range = NSRange(line.startIndex.. Transaction? { + // FIXME: Shouldn't this init throw? + let regex = try! 
Regex(compiling: pattern) + +// guard let result = line.match(regex) else { return nil } +// +// // TODO: We should have Regex or somesuch and `.1` +// // should be the same as `\1`. +// let dynCaps = result.1 +// +// +// let kind = Transaction.Kind(result.1.first!.capture as Substring) + + return nil +} + +@available(macOS 12.0, *) +private func processWithRuntimeStaticRegex(_ line: String) -> Transaction? { + let regex: Regex<(Substring, Substring, Substring, Substring, Substring)> + = try! Regex(compiling: pattern) + + return process(line, using: regex) +} + +@available(macOS 12.0, *) +private func processWithDSL(_ line: String) -> Transaction? { + let fieldSeparator = Regex { + CharacterClass.whitespace + OneOrMore(.whitespace) + } + + let regex = Regex { + Capture(OneOrMore(.word)) + fieldSeparator + + Capture(OneOrMore(.whitespace.inverted)) + fieldSeparator + + Capture { + OneOrMore { + Lookahead( + // FIXME: `fieldSeparator` differs, why? + Regex { + CharacterClass.whitespace + CharacterClass.whitespace + }, negative: true) + CharacterClass.any + } + } + fieldSeparator + + Capture { OneOrMore(.any) } + } + + return process(line, using: regex) +} + +@available(macOS 12.0, *) +private func process( + _ line: String, + using regex: Regex<(Substring, Substring, Substring, Substring, Substring)> +) -> Transaction? { + guard let output = try? regex.matchWhole(line), + let kind = Transaction.Kind(output.1) + else { + return nil + } + + let dateStrat = Date.FormatStyle(date: .numeric).parseStrategy + guard let date = try? Date(String(output.2), strategy: dateStrat) else { + return nil + } + + let account = String(output.3) + + guard let amount = try? Decimal( + String(output.4), format: .currency(code: "USD") + ) else { + return nil + } + + return Transaction( + kind: kind, date: date, account: account, amount: amount) +} + +extension RegexDSLTests { + + // TODO: FormatStyle not available on Linux... 
+ @available(macOS 12.0, *) + func testBankStatement() { + // TODO: Stop printing and start testing... + + for line in statement.split(separator: "\n") { + let line = String(line) + _ = processEntry(line) + + // NSRegularExpression + let referenceOutput = processWithNSRegularExpression(line)! + + XCTAssertEqual( + referenceOutput, processWithNSRegularExpression(line)) + + _ = processWithRuntimeDynamicRegex(line) + + // Static run-time regex + XCTAssertEqual( + referenceOutput, processWithRuntimeStaticRegex(line)) + + // DSL + let dslOut = processWithDSL(line)! + guard referenceOutput == dslOut else { + if referenceOutput.account != dslOut.account { + // FIXME: Bug in lookahead here? + continue + } + + XCTFail() + continue + } + + } + + } + +} + +#endif + diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 93a1dda65..50358734d 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -24,7 +24,7 @@ class RegexDSLTests: XCTestCase { ) throws { let regex = content() for (input, maybeExpectedCaptures) in tests { - let maybeMatch = input.match(regex) + let maybeMatch = input.matchWhole(regex) if let expectedCaptures = maybeExpectedCaptures { let match = try XCTUnwrap(maybeMatch, file: file, line: line) XCTAssertTrue( @@ -52,12 +52,12 @@ class RegexDSLTests: XCTestCase { } // Assert the inferred capture type. let _: (Substring, Substring, Int).Type = type(of: regex).Output.self - let maybeMatch = "ab1".match(regex) + let maybeMatch = "ab1".matchWhole(regex) let match = try XCTUnwrap(maybeMatch) XCTAssertTrue(match.output == ("ab1", "b", 1)) let substring = "ab1"[...] 
- let substringMatch = try XCTUnwrap(substring.match(regex)) + let substringMatch = try XCTUnwrap(substring.matchWhole(regex)) XCTAssertTrue(match.output == substringMatch.output) } @@ -73,7 +73,7 @@ class RegexDSLTests: XCTestCase { } func testMatchResultDotZeroWithoutCapture() throws { - let match = try XCTUnwrap("aaa".match { OneOrMore { "a" } }) + let match = try XCTUnwrap("aaa".matchWhole { OneOrMore { "a" } }) XCTAssertEqual(match.0, "aaa") } @@ -82,8 +82,8 @@ class RegexDSLTests: XCTestCase { let regex = ChoiceOf { "aaa" } - XCTAssertTrue("aaa".match(regex)?.output == "aaa") - XCTAssertNil("aab".match(regex)?.output) + XCTAssertTrue("aaa".matchWhole(regex)?.output == "aaa") + XCTAssertNil("aab".matchWhole(regex)?.output) } do { let regex = ChoiceOf { @@ -91,10 +91,10 @@ class RegexDSLTests: XCTestCase { "bbb" "ccc" } - XCTAssertTrue("aaa".match(regex)?.output == "aaa") - XCTAssertNil("aab".match(regex)?.output) - XCTAssertTrue("bbb".match(regex)?.output == "bbb") - XCTAssertTrue("ccc".match(regex)?.output == "ccc") + XCTAssertTrue("aaa".matchWhole(regex)?.output == "aaa") + XCTAssertNil("aab".matchWhole(regex)?.output) + XCTAssertTrue("bbb".matchWhole(regex)?.output == "bbb") + XCTAssertTrue("ccc".matchWhole(regex)?.output == "ccc") } do { let regex = Regex { @@ -109,7 +109,7 @@ class RegexDSLTests: XCTestCase { } } XCTAssertTrue( - try XCTUnwrap("abc".match(regex)?.output) == ("abc", "c")) + try XCTUnwrap("abc".matchWhole(regex)?.output) == ("abc", "c")) } do { let regex = ChoiceOf { @@ -117,18 +117,18 @@ class RegexDSLTests: XCTestCase { "bbb" "ccc" } - XCTAssertTrue("aaa".match(regex)?.output == "aaa") - XCTAssertNil("aab".match(regex)?.output) - XCTAssertTrue("bbb".match(regex)?.output == "bbb") - XCTAssertTrue("ccc".match(regex)?.output == "ccc") + XCTAssertTrue("aaa".matchWhole(regex)?.output == "aaa") + XCTAssertNil("aab".matchWhole(regex)?.output) + XCTAssertTrue("bbb".matchWhole(regex)?.output == "bbb") + 
XCTAssertTrue("ccc".matchWhole(regex)?.output == "ccc") } do { let regex = ChoiceOf { Capture("aaa") } XCTAssertTrue( - try XCTUnwrap("aaa".match(regex)?.output) == ("aaa", "aaa")) - XCTAssertNil("aab".match(regex)?.output) + try XCTUnwrap("aaa".matchWhole(regex)?.output) == ("aaa", "aaa")) + XCTAssertNil("aab".matchWhole(regex)?.output) } do { let regex = ChoiceOf { @@ -137,12 +137,12 @@ class RegexDSLTests: XCTestCase { Capture("ccc") } XCTAssertTrue( - try XCTUnwrap("aaa".match(regex)?.output) == ("aaa", "aaa", nil, nil)) + try XCTUnwrap("aaa".matchWhole(regex)?.output) == ("aaa", "aaa", nil, nil)) XCTAssertTrue( - try XCTUnwrap("bbb".match(regex)?.output) == ("bbb", nil, "bbb", nil)) + try XCTUnwrap("bbb".matchWhole(regex)?.output) == ("bbb", nil, "bbb", nil)) XCTAssertTrue( - try XCTUnwrap("ccc".match(regex)?.output) == ("ccc", nil, nil, "ccc")) - XCTAssertNil("aab".match(regex)?.output) + try XCTUnwrap("ccc".matchWhole(regex)?.output) == ("ccc", nil, nil, "ccc")) + XCTAssertNil("aab".matchWhole(regex)?.output) } } @@ -342,7 +342,7 @@ class RegexDSLTests: XCTestCase { // Assert the inferred capture type. 
let _: Substring.Type = type(of: regex).Output.self let input = "123123" - let match = try XCTUnwrap(input.match(regex)?.output) + let match = try XCTUnwrap(input.matchWhole(regex)?.output) XCTAssertTrue(match == input) } @@ -469,7 +469,7 @@ class RegexDSLTests: XCTestCase { let unicodeLine = "1BCA0..1BCA3 ; Control # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP" - let match = try XCTUnwrap(unicodeLine.match(unicodeData)) + let match = try XCTUnwrap(unicodeLine.matchWhole(unicodeData)) XCTAssertEqual(match.0, Substring(unicodeLine)) XCTAssertEqual(match.1, "Control") } @@ -501,7 +501,7 @@ class RegexDSLTests: XCTestCase { Substring, Unicode.Scalar?, Unicode.Scalar??, Substring ) let _: ExpectedMatch.Type = type(of: regexWithCapture).Output.self - let maybeMatchResult = line.match(regexWithCapture) + let maybeMatchResult = line.matchWhole(regexWithCapture) let matchResult = try XCTUnwrap(maybeMatchResult) let (wholeMatch, lower, upper, propertyString) = matchResult.output XCTAssertEqual(wholeMatch, Substring(line)) @@ -536,7 +536,7 @@ class RegexDSLTests: XCTestCase { Substring, Unicode.Scalar, Unicode.Scalar?, Substring ) let _: ExpectedMatch.Type = type(of: regexWithTryCapture).Output.self - let maybeMatchResult = line.match(regexWithTryCapture) + let maybeMatchResult = line.matchWhole(regexWithTryCapture) let matchResult = try XCTUnwrap(maybeMatchResult) let (wholeMatch, lower, upper, propertyString) = matchResult.output XCTAssertEqual(wholeMatch, Substring(line)) @@ -549,7 +549,7 @@ class RegexDSLTests: XCTestCase { let regexLiteral = try MockRegexLiteral( #"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#, matching: (Substring, Substring, Substring?, Substring).self) - let maybeMatchResult = line.match(regexLiteral) + let maybeMatchResult = line.matchWhole(regexLiteral) let matchResult = try XCTUnwrap(maybeMatchResult) let (wholeMatch, lower, upper, propertyString) = matchResult.output XCTAssertEqual(wholeMatch, Substring(line)) @@ -561,21 
+561,21 @@ class RegexDSLTests: XCTestCase { func testDynamicCaptures() throws { do { - let regex = try Regex("aabcc.") + let regex = try Regex(compiling: "aabcc.") let line = "aabccd" - let match = try XCTUnwrap(line.match(regex)) + let match = try XCTUnwrap(line.matchWhole(regex)) XCTAssertEqual(match.0, line[...]) let output = match.output XCTAssertEqual(output[0].substring, line[...]) } do { let regex = try Regex( - #"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#) + compiling: #"([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+(\w+).*"#) let line = """ A6F0..A6F1 ; Extend # Mn [2] BAMUM COMBINING MARK KOQNDON..BAMUM \ COMBINING MARK TUKWENTIS """ - let match = try XCTUnwrap(line.match(regex)) + let match = try XCTUnwrap(line.matchWhole(regex)) XCTAssertEqual(match.0, line[...]) let output = match.output XCTAssertEqual(output[0].substring, line[...]) @@ -640,7 +640,7 @@ class RegexDSLTests: XCTestCase { } } let input = "abc#41#42abc#42#42" - let result = try XCTUnwrap(input.match(regex)) + let result = try XCTUnwrap(input.matchWhole(regex)) XCTAssertEqual(result[a], "abc") XCTAssertEqual(result[b], 42) } @@ -720,7 +720,7 @@ class RegexDSLTests: XCTestCase { let parser = SemanticVersionParser() for (str, version) in versions { - XCTAssertEqual(str.match(parser)?.output, version) + XCTAssertEqual(str.matchWhole(parser)?.output, version) } } } diff --git a/Tests/RegexTests/AlgorithmsTests.swift b/Tests/RegexTests/AlgorithmsTests.swift index b51f12100..bc80746c6 100644 --- a/Tests/RegexTests/AlgorithmsTests.swift +++ b/Tests/RegexTests/AlgorithmsTests.swift @@ -32,7 +32,7 @@ class RegexConsumerTests: XCTestCase { _ expected: [Range], file: StaticString = #file, line: UInt = #line ) { - let regex = try! Regex(regex) + let regex = try! 
Regex(compiling: regex) let actualSeq: [Range] = string[...].ranges(of: regex).map { let start = string.offset(ofIndex: $0.lowerBound) @@ -77,7 +77,7 @@ class RegexConsumerTests: XCTestCase { _ expected: [Substring], file: StaticString = #file, line: UInt = #line ) { - let regex = try! Regex(regex) + let regex = try! Regex(compiling: regex) let actual = Array(string.split(by: regex)) XCTAssertEqual(actual, expected, file: file, line: line) } @@ -97,7 +97,7 @@ class RegexConsumerTests: XCTestCase { _ expected: String, file: StaticString = #file, line: UInt = #line ) { - let regex = try! Regex(regex) + let regex = try! Regex(compiling: regex) let actual = string.replacing(regex, with: replacement) XCTAssertEqual(actual, expected, file: file, line: line) } @@ -116,7 +116,7 @@ class RegexConsumerTests: XCTestCase { } func testAdHoc() { - let r = try! Regex("a|b+") + let r = try! Regex(compiling: "a|b+") XCTAssert("palindrome".contains(r)) XCTAssert("botany".contains(r)) From 43a78e81762c74a10da091df56da7b7d4179c5f5 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 1 Apr 2022 23:41:31 +0100 Subject: [PATCH 16/17] Update escaping rules in RegexSyntax.md Tweak the text to say that e.g `\I` is invalid. --- Documentation/Evolution/RegexSyntax.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 958d52f7d..2b86f99f7 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -160,7 +160,7 @@ Atom -> Anchor | '\'? ``` -Atoms are the smallest units of regex syntax. They include escape sequences, metacharacters, backreferences, etc. The most basic form of atom is a literal character. A metacharacter may be treated as literal by preceding it with a backslash. Other literal characters may also be preceded with a backslash, but it has no effect if they are unknown escape sequences, e.g `\I` is literal `I`. 
+Atoms are the smallest units of regex syntax. They include escape sequences, metacharacters, backreferences, etc. The most basic form of atom is a literal character. A metacharacter may be treated as literal by preceding it with a backslash. Other literal characters may also be preceded by a backslash, in which case it has no effect, e.g `\%` is literal `%`. However this does not apply to either non-whitespace Unicode characters, or to unknown ASCII letter character escapes, e.g `\I` is invalid. #### Anchors From 2aa67f8f813e835d47f8ef614add8689247a8ec0 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Mon, 4 Apr 2022 10:49:51 +0100 Subject: [PATCH 17/17] Update Documentation/Evolution/RegexSyntax.md Co-authored-by: Michael Ilseman --- Documentation/Evolution/RegexSyntax.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/Evolution/RegexSyntax.md b/Documentation/Evolution/RegexSyntax.md index 2b86f99f7..faa327176 100644 --- a/Documentation/Evolution/RegexSyntax.md +++ b/Documentation/Evolution/RegexSyntax.md @@ -160,7 +160,7 @@ Atom -> Anchor | '\'? ``` -Atoms are the smallest units of regex syntax. They include escape sequences, metacharacters, backreferences, etc. The most basic form of atom is a literal character. A metacharacter may be treated as literal by preceding it with a backslash. Other literal characters may also be preceded by a backslash, in which case it has no effect, e.g `\%` is literal `%`. However this does not apply to either non-whitespace Unicode characters, or to unknown ASCII letter character escapes, e.g `\I` is invalid. +Atoms are the smallest units of regex syntax. They include escape sequences, metacharacters, backreferences, etc. The most basic form of atom is a literal character. A metacharacter may be treated as literal by preceding it with a backslash. Other literal characters may also be preceded by a backslash, in which case it has no effect, e.g `\%` is literal `%`. 
However this does not apply to either non-ASCII non-whitespace characters, or to unknown ASCII letter character escapes, e.g `\I` is invalid and would produce an error.

#### Anchors