From 863aebe9201d58df6185bce1d33d9d0dda29b78b Mon Sep 17 00:00:00 2001 From: Alejandro Alonso Date: Thu, 14 Jul 2022 15:56:59 -0700 Subject: [PATCH 01/28] Merge pull request #575 from Azoy/various-tidbits Rename various APIs --- Sources/RegexBuilder/Anchor.swift | 12 ++++++++++++ Sources/_StringProcessing/Regex/Options.swift | 10 +++++----- Tests/RegexBuilderTests/RegexDSLTests.swift | 2 +- Tests/RegexTests/UTS18Tests.swift | 2 +- 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/Sources/RegexBuilder/Anchor.swift b/Sources/RegexBuilder/Anchor.swift index 31a3e8a0d..28fc7e8d1 100644 --- a/Sources/RegexBuilder/Anchor.swift +++ b/Sources/RegexBuilder/Anchor.swift @@ -104,6 +104,12 @@ extension Anchor { /// /// This anchor is equivalent to `^` in regex syntax when the `m` option /// has been enabled or `anchorsMatchLineEndings(true)` has been called. + /// + /// For example, the following regexes are all equivalent: + /// + /// - `Regex { Anchor.startOfLine }` + /// - `/(?m)^/` or `/(?m:^)/` + /// - `/^/.anchorsMatchLineEndings(true)` public static var startOfLine: Anchor { Anchor(kind: .startOfLine) } @@ -113,6 +119,12 @@ extension Anchor { /// /// This anchor is equivalent to `$` in regex syntax when the `m` option /// has been enabled or `anchorsMatchLineEndings(true)` has been called. + /// + /// For example, the following regexes are all equivalent: + /// + /// - `Regex { Anchor.endOfLine }` + /// - `/(?m)$/` or `/(?m:$)/` + /// - `/$/.anchorsMatchLineEndings(true)` public static var endOfLine: Anchor { Anchor(kind: .endOfLine) } diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift index 24d5c422e..c66502d96 100644 --- a/Sources/_StringProcessing/Regex/Options.swift +++ b/Sources/_StringProcessing/Regex/Options.swift @@ -65,7 +65,7 @@ extension RegexComponent { /// - Parameter wordBoundaryKind: The algorithm to use for determining word boundaries. /// - Returns: The modified regular expression. 
public func wordBoundaryKind(_ wordBoundaryKind: RegexWordBoundaryKind) -> Regex { - wrapInOption(.unicodeWordBoundaries, addingIf: wordBoundaryKind == .unicodeLevel2) + wrapInOption(.unicodeWordBoundaries, addingIf: wordBoundaryKind == .default) } /// Returns a regular expression where the start and end of input @@ -83,8 +83,8 @@ extension RegexComponent { /// /// This method corresponds to applying the `m` option in regex syntax. For /// this behavior in the `RegexBuilder` syntax, see - /// ``Anchor.startOfLine``, ``Anchor.endOfLine``, ``Anchor.startOfInput``, - /// and ``Anchor.endOfInput``. + /// ``Anchor.startOfLine``, ``Anchor.endOfLine``, ``Anchor.startOfSubject``, + /// and ``Anchor.endOfSubject``. /// /// - Parameter matchLineEndings: A Boolean value indicating whether `^` and /// `$` should match the start and end of lines, respectively. @@ -205,7 +205,7 @@ public struct RegexWordBoundaryKind: Hashable { /// that match `/\w\W/` or `/\W\w/`, or between the start or end of the input /// and a `\w` character. Word boundaries therefore depend on the option- /// defined behavior of `\w`. - public static var unicodeLevel1: Self { + public static var simple: Self { .init(base: .unicodeLevel1) } @@ -215,7 +215,7 @@ public struct RegexWordBoundaryKind: Hashable { /// Default word boundaries use a Unicode algorithm that handles some cases /// better than simple word boundaries, such as words with internal /// punctuation, changes in script, and Emoji. 
- public static var unicodeLevel2: Self { + public static var `default`: Self { .init(base: .unicodeLevel2) } } diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index b67c6c242..da85af28e 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -307,7 +307,7 @@ class RegexDSLTests: XCTestCase { OneOrMore(.word) Anchor.wordBoundary } - .wordBoundaryKind(.unicodeLevel1) + .wordBoundaryKind(.simple) OneOrMore(.any, .reluctant) "stop" } diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index fa8a1729d..aa3639ea6 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -222,7 +222,7 @@ extension UTS18Tests { // - Nonspacing marks are never divided from their base characters, and // otherwise ignored in locating boundaries. func testSimpleWordBoundaries() { - let simpleWordRegex = regex(#".+?\b"#).wordBoundaryKind(.unicodeLevel1) + let simpleWordRegex = regex(#".+?\b"#).wordBoundaryKind(.simple) expectFirstMatch(input, simpleWordRegex, input[pos: ..<11]) expectFirstMatch("don't", simpleWordRegex, "don") expectFirstMatch("Cafe\u{301}", simpleWordRegex, "Café") From 1079533bedbf086288bed32c69d3af49fa12efb5 Mon Sep 17 00:00:00 2001 From: Alejandro Alonso Date: Thu, 14 Jul 2022 16:17:28 -0700 Subject: [PATCH 02/28] Merge pull request #576 from Azoy/options-regex Move options from RegexComponent to Regex --- Sources/_StringProcessing/Regex/Options.swift | 2 +- Tests/RegexBuilderTests/RegexDSLTests.swift | 69 ++++++++++++------- 2 files changed, 44 insertions(+), 27 deletions(-) diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift index c66502d96..88d2dbf5d 100644 --- a/Sources/_StringProcessing/Regex/Options.swift +++ b/Sources/_StringProcessing/Regex/Options.swift @@ -12,7 +12,7 @@ @_implementationOnly import _RegexParser @available(SwiftStdlib 5.7, *) 
-extension RegexComponent { +extension Regex { /// Returns a regular expression that ignores case when matching. /// /// - Parameter ignoresCase: A Boolean value indicating whether to ignore case. diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index da85af28e..47d2ebf02 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -234,8 +234,10 @@ class RegexDSLTests: XCTestCase { ("abcabc", "abcabc"), ("abcABCaBc", "abcABCaBc"), matchType: Substring.self, ==) { - OneOrMore { - "abc" + Regex { + OneOrMore { + "abc" + } }.ignoresCase(true) } @@ -247,8 +249,10 @@ class RegexDSLTests: XCTestCase { ("abcabc", "abcabc"), ("abcABCaBc", "abcABCaBc"), matchType: Substring.self, ==) { - OneOrMore { - "abc" + Regex { + OneOrMore { + "abc" + } } .ignoresCase(true) .ignoresCase(false) @@ -264,9 +268,13 @@ class RegexDSLTests: XCTestCase { ("abcabc", "abcabc"), ("abcdeABCdeaBcde", "abcdeABCdeaBcde"), matchType: Substring.self, ==) { - OneOrMore { - "abc".ignoresCase(true) - Optionally("de") + Regex { + OneOrMore { + Regex { + "abc" + }.ignoresCase(true) + Optionally("de") + } } .ignoresCase(false) } @@ -303,11 +311,13 @@ class RegexDSLTests: XCTestCase { "stop" " " - Capture { - OneOrMore(.word) - Anchor.wordBoundary - } - .wordBoundaryKind(.simple) + Regex { + Capture { + OneOrMore(.word) + Anchor.wordBoundary + } + }.wordBoundaryKind(.simple) + OneOrMore(.any, .reluctant) "stop" } @@ -317,15 +327,17 @@ class RegexDSLTests: XCTestCase { matchType: (Substring, Substring, Substring).self, ==) { Capture { // Reluctant behavior due to option - OneOrMore(.anyOf("abcd")) - .repetitionBehavior(.reluctant) + Regex { + OneOrMore(.anyOf("abcd")) + }.repetitionBehavior(.reluctant) } ZeroOrMore("a"..."z") Capture { // Eager behavior due to explicit parameter, despite option - OneOrMore(.digit, .eager) - .repetitionBehavior(.reluctant) + Regex { + OneOrMore(.digit, .eager) + 
}.repetitionBehavior(.reluctant) } ZeroOrMore(.digit) } @@ -334,10 +346,11 @@ class RegexDSLTests: XCTestCase { ("abcdefg", ("abcdefg", "abcdefg")), ("abcdéfg", ("abcdéfg", "abcd")), matchType: (Substring, Substring).self, ==) { - Capture { - OneOrMore(.word) - } - .asciiOnlyWordCharacters() + Regex { + Capture { + OneOrMore(.word) + } + }.asciiOnlyWordCharacters() ZeroOrMore(.any) } @@ -368,8 +381,10 @@ class RegexDSLTests: XCTestCase { ("abc1def2", ("abc1def2", "1")), matchType: (Substring, Substring).self, ==) { - OneOrMore(.reluctant) { - One(.word) + Regex { + OneOrMore(.reluctant) { + One(.word) + } }.repetitionBehavior(.possessive) Capture(.digit) ZeroOrMore(.any) @@ -421,8 +436,9 @@ class RegexDSLTests: XCTestCase { { Regex { Capture { - OneOrMore("a") - .repetitionBehavior(.eager) + Regex { + OneOrMore("a") + }.repetitionBehavior(.eager) } OneOrMore("a") }.repetitionBehavior(.possessive) @@ -434,8 +450,9 @@ class RegexDSLTests: XCTestCase { { Regex { Capture { - OneOrMore("a") - .repetitionBehavior(.reluctant) + Regex { + OneOrMore("a") + }.repetitionBehavior(.reluctant) } OneOrMore("a") }.repetitionBehavior(.possessive) From 5966a5ca801f137f9490d284f7ea5d506344b502 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:37 +0100 Subject: [PATCH 03/28] Allow matching tests to specify semantic level --- Tests/RegexTests/MatchTests.swift | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index d375065ab..bb0d2b63e 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -24,9 +24,10 @@ func _firstMatch( _ regexStr: String, input: String, validateOptimizations: Bool, + semanticLevel: RegexSemanticLevel = .graphemeCluster, syntax: SyntaxOptions = .traditional ) throws -> (String, [String?]) { - var regex = try Regex(regexStr, syntax: syntax) + var regex = try Regex(regexStr, syntax: 
syntax).matchingSemantics(semanticLevel) guard let result = try regex.firstMatch(in: input) else { throw MatchError("match not found for \(regexStr) in \(input)") } @@ -54,6 +55,7 @@ func flatCaptureTest( dumpAST: Bool = false, xfail: Bool = false, validateOptimizations: Bool = true, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #file, line: UInt = #line ) { @@ -63,6 +65,7 @@ func flatCaptureTest( regex, input: test, validateOptimizations: validateOptimizations, + semanticLevel: semanticLevel, syntax: syntax ) else { if expect == nil { @@ -113,6 +116,7 @@ func matchTest( dumpAST: Bool = false, xfail: Bool = false, validateOptimizations: Bool = true, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #file, line: UInt = #line ) { @@ -126,6 +130,7 @@ func matchTest( dumpAST: dumpAST, xfail: xfail, validateOptimizations: validateOptimizations, + semanticLevel: semanticLevel, file: file, line: line) } @@ -143,6 +148,7 @@ func firstMatchTest( dumpAST: Bool = false, xfail: Bool = false, validateOptimizations: Bool = true, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #filePath, line: UInt = #line ) { @@ -151,6 +157,7 @@ func firstMatchTest( regex, input: input, validateOptimizations: validateOptimizations, + semanticLevel: semanticLevel, syntax: syntax) if xfail { From fe63fb4d4d4184ea2c2959b9e15a1bd4b0471007 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:38 +0100 Subject: [PATCH 04/28] Rip out unused _CharacterClassModel API Remove the DSL -> _CharacterClassModel conversion, and _CharacterClassModel's custom character class matching logic, none of which is being used. 
--- .../_CharacterClassModel.swift | 192 ------------------ 1 file changed, 192 deletions(-) diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index db2088782..c1183972b 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -50,74 +50,6 @@ public struct _CharacterClassModel: Hashable { case whitespace /// Character.isLetter or Character.isDigit or Character == "_" case word - /// One of the custom character set. - case custom([CharacterSetComponent]) - } - - public enum SetOperator: Hashable { - case subtraction - case intersection - case symmetricDifference - } - - /// A binary set operation that forms a character class component. - public struct SetOperation: Hashable { - var lhs: CharacterSetComponent - var op: SetOperator - var rhs: CharacterSetComponent - - func matches(_ c: Character, with options: MatchingOptions) -> Bool { - switch op { - case .intersection: - return lhs.matches(c, with: options) && rhs.matches(c, with: options) - case .subtraction: - return lhs.matches(c, with: options) && !rhs.matches(c, with: options) - case .symmetricDifference: - return lhs.matches(c, with: options) != rhs.matches(c, with: options) - } - } - } - - public enum CharacterSetComponent: Hashable { - case character(Character) - case range(ClosedRange) - - /// A nested character class. - case characterClass(_CharacterClassModel) - - /// A binary set operation of character class components. 
- indirect case setOperation(SetOperation) - - public static func setOperation( - lhs: CharacterSetComponent, op: SetOperator, rhs: CharacterSetComponent - ) -> CharacterSetComponent { - .setOperation(.init(lhs: lhs, op: op, rhs: rhs)) - } - - func matches(_ character: Character, with options: MatchingOptions) -> Bool { - switch self { - case .character(let c): - if options.isCaseInsensitive { - return c.lowercased() == character.lowercased() - } else { - return c == character - } - case .range(let range): - if options.isCaseInsensitive { - let newLower = range.lowerBound.lowercased() - let newUpper = range.upperBound.lowercased() - // FIXME: Is failing this possible? Is this the right behavior if so? - guard newLower <= newUpper else { return false } - return (newLower...newUpper).contains(character.lowercased()) - } else { - return range.contains(character) - } - case .characterClass(let custom): - let str = String(character) - return custom.matches(in: str, at: str.startIndex, with: options) != nil - case .setOperation(let op): return op.matches(character, with: options) - } - } } enum MatchLevel: Hashable { @@ -188,8 +120,6 @@ public struct _CharacterClassModel: Hashable { matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = c.isWordCharacter && (c.isASCII || !options.usesASCIIWord) - case .custom(let set): - matched = set.any { $0.matches(c, with: options) } } if isInverted { matched.toggle() @@ -222,8 +152,6 @@ public struct _CharacterClassModel: Hashable { matched = c.properties.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !options.usesASCIIWord) - case .custom(let set): - matched = set.any { $0.matches(Character(c), with: options) } } if isInverted { matched.toggle() @@ -286,23 +214,6 @@ extension _CharacterClassModel { public static var word: _CharacterClassModel { .init(cc: .word, matchLevel: .graphemeCluster) } - - public static 
func custom( - _ components: [_CharacterClassModel.CharacterSetComponent] - ) -> _CharacterClassModel { - .init(cc: .custom(components), matchLevel: .graphemeCluster) - } -} - -extension _CharacterClassModel.CharacterSetComponent: CustomStringConvertible { - public var description: String { - switch self { - case .range(let range): return "" - case .character(let character): return "" - case .characterClass(let custom): return "\(custom)" - case .setOperation(let op): return "<\(op.lhs) \(op.op) \(op.rhs)>" - } - } } extension _CharacterClassModel.Representation: CustomStringConvertible { @@ -318,7 +229,6 @@ extension _CharacterClassModel.Representation: CustomStringConvertible { case .verticalWhitespace: return "vertical whitespace" case .whitespace: return "" case .word: return "" - case .custom(let set): return "" } } } @@ -391,22 +301,6 @@ extension _CharacterClassModel { } } -extension DSLTree.Node { - var characterClass: _CharacterClassModel? { - switch self { - case let .customCharacterClass(ccc): - return ccc.modelCharacterClass - case let .atom(a): - return a.characterClass - case .characterPredicate: - // FIXME: Do we make one from this? - return nil - default: - return nil - } - } -} - extension _CharacterClassModel { func withMatchLevel( _ level: _CharacterClassModel.MatchLevel @@ -417,17 +311,6 @@ extension _CharacterClassModel { } } -extension DSLTree.Atom { - var characterClass: _CharacterClassModel? { - switch self { - case let .unconverted(a): - return a.ast.characterClass - - default: return nil - } - } -} - extension AST.Atom { var characterClass: _CharacterClassModel? { switch kind { @@ -489,81 +372,6 @@ extension AST.Atom.EscapedBuiltin { } } -extension DSLTree.CustomCharacterClass { - // TODO: Refactor a bit, and... can we drop this type? - var modelCharacterClass: _CharacterClassModel? 
{ - var result = - Array<_CharacterClassModel.CharacterSetComponent>() - for m in members { - switch m { - case let .atom(a): - if let cc = a.characterClass { - result.append(.characterClass(cc)) - } else if let c = a.literalCharacterValue { - result.append(.character(c)) - } else { - return nil - } - case let .range(low, high): - guard let lhs = low.literalCharacterValue, - let rhs = high.literalCharacterValue - else { - return nil - } - result.append(.range(lhs...rhs)) - - case let .custom(ccc): - guard let cc = ccc.modelCharacterClass else { - return nil - } - result.append(.characterClass(cc)) - - case let .intersection(lhs, rhs): - guard let lhs = lhs.modelCharacterClass, - let rhs = rhs.modelCharacterClass - else { - return nil - } - result.append(.setOperation( - lhs: .characterClass(lhs), - op: .intersection, - rhs: .characterClass(rhs))) - - case let .subtraction(lhs, rhs): - guard let lhs = lhs.modelCharacterClass, - let rhs = rhs.modelCharacterClass - else { - return nil - } - result.append(.setOperation( - lhs: .characterClass(lhs), - op: .subtraction, - rhs: .characterClass(rhs))) - - case let .symmetricDifference(lhs, rhs): - guard let lhs = lhs.modelCharacterClass, - let rhs = rhs.modelCharacterClass - else { - return nil - } - result.append(.setOperation( - lhs: .characterClass(lhs), - op: .symmetricDifference, - rhs: .characterClass(rhs))) - - case let .quotedLiteral(s): - // Decompose quoted literal into literal characters. - result += s.map { .character($0) } - - case .trivia: - break - } - } - let cc = _CharacterClassModel.custom(result) - return isInverted ? 
cc.inverted : cc - } -} - extension _CharacterClassModel { // FIXME: Calling on inverted sets wont be the same as the // inverse of a boundary if at the start or end of the From b309fa54e6abb2fbd292783b88b9ef6f43ee49a6 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:38 +0100 Subject: [PATCH 05/28] Remove _CharacterClassModel conformance to RegexComponent --- Sources/_StringProcessing/_CharacterClassModel.swift | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index c1183972b..e280ba473 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -161,18 +161,6 @@ public struct _CharacterClassModel: Hashable { } } -@available(SwiftStdlib 5.7, *) -extension _CharacterClassModel: RegexComponent { - public typealias RegexOutput = Substring - - public var regex: Regex { - guard let ast = self.makeAST() else { - fatalError("FIXME: extended AST?") - } - return Regex(ast: ast) - } -} - @_spi(RegexBuilder) extension _CharacterClassModel { public static var any: _CharacterClassModel { From 0ab30798291d5edf943d4e3bdc8f57ee4694cc8a Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:39 +0100 Subject: [PATCH 06/28] Internalize `_CharacterClassModel` `makeDSLTreeCharacterClass` was the last API that required it to be public. Remove it, and replace it with some static members on `_AST.Atom`. 
--- Sources/RegexBuilder/CharacterClass.swift | 21 ++--- Sources/_StringProcessing/Regex/DSLTree.swift | 26 +++++ .../_CharacterClassModel.swift | 94 +++---------------- 3 files changed, 50 insertions(+), 91 deletions(-) diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index a6d18b2cf..4e96e510d 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -20,11 +20,8 @@ public struct CharacterClass { self.ccc = ccc } - init(unconverted model: _CharacterClassModel) { - guard let ccc = model.makeDSLTreeCharacterClass() else { - fatalError("Unsupported character class") - } - self.ccc = ccc + init(unconverted atom: DSLTree._AST.Atom) { + self.ccc = .init(members: [.atom(.unconverted(atom))]) } } @@ -49,15 +46,15 @@ extension RegexComponent where Self == CharacterClass { } public static var anyGraphemeCluster: CharacterClass { - .init(unconverted: .anyGrapheme) + .init(unconverted: ._anyGrapheme) } public static var whitespace: CharacterClass { - .init(unconverted: .whitespace) + .init(unconverted: ._whitespace) } public static var digit: CharacterClass { - .init(unconverted: .digit) + .init(unconverted: ._digit) } public static var hexDigit: CharacterClass { @@ -69,19 +66,19 @@ extension RegexComponent where Self == CharacterClass { } public static var horizontalWhitespace: CharacterClass { - .init(unconverted: .horizontalWhitespace) + .init(unconverted: ._horizontalWhitespace) } public static var newlineSequence: CharacterClass { - .init(unconverted: .newlineSequence) + .init(unconverted: ._newlineSequence) } public static var verticalWhitespace: CharacterClass { - .init(unconverted: .verticalWhitespace) + .init(unconverted: ._verticalWhitespace) } public static var word: CharacterClass { - .init(unconverted: .word) + .init(unconverted: ._word) } } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 
740bdcb8d..eb357ae87 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -820,6 +820,32 @@ extension DSLTree { @_spi(RegexBuilder) public struct Atom { internal var ast: AST.Atom + + // FIXME: The below APIs should be removed once the DSL tree has been + // migrated to use proper DSL atoms for them. + + public static var _anyGrapheme: Self { + .init(ast: .init(.escaped(.graphemeCluster), .fake)) + } + public static var _whitespace: Self { + .init(ast: .init(.escaped(.whitespace), .fake)) + } + public static var _digit: Self { + .init(ast: .init(.escaped(.decimalDigit), .fake)) + } + public static var _horizontalWhitespace: Self { + .init(ast: .init(.escaped(.horizontalWhitespace), .fake)) + } + public static var _newlineSequence: Self { + // FIXME: newline sequence is not same as \n + .init(ast: .init(.escaped(.newline), .fake)) + } + public static var _verticalWhitespace: Self { + .init(ast: .init(.escaped(.verticalTab), .fake)) + } + public static var _word: Self { + .init(ast: .init(.escaped(.wordCharacter), .fake)) + } } } } diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index e280ba473..c0de6ebaa 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -15,8 +15,7 @@ // an AST, but this isn't a natural thing to produce in the context // of parsing or to store in an AST -@_spi(RegexBuilder) -public struct _CharacterClassModel: Hashable { +struct _CharacterClassModel: Hashable { /// The actual character class to match. var cc: Representation @@ -28,7 +27,7 @@ public struct _CharacterClassModel: Hashable { var isInverted: Bool = false // TODO: Split out builtin character classes into their own type? 
- public enum Representation: Hashable { + enum Representation: Hashable { /// Any character case any /// Any grapheme cluster @@ -85,7 +84,7 @@ public struct _CharacterClassModel: Hashable { } /// Inverts a character class. - public var inverted: Self { + var inverted: Self { return withInversion(true) } @@ -161,51 +160,50 @@ public struct _CharacterClassModel: Hashable { } } -@_spi(RegexBuilder) extension _CharacterClassModel { - public static var any: _CharacterClassModel { + static var any: _CharacterClassModel { .init(cc: .any, matchLevel: .graphemeCluster) } - public static var anyGrapheme: _CharacterClassModel { + static var anyGrapheme: _CharacterClassModel { .init(cc: .anyGrapheme, matchLevel: .graphemeCluster) } - public static var anyUnicodeScalar: _CharacterClassModel { + static var anyUnicodeScalar: _CharacterClassModel { .init(cc: .any, matchLevel: .unicodeScalar) } - public static var whitespace: _CharacterClassModel { + static var whitespace: _CharacterClassModel { .init(cc: .whitespace, matchLevel: .graphemeCluster) } - public static var digit: _CharacterClassModel { + static var digit: _CharacterClassModel { .init(cc: .digit, matchLevel: .graphemeCluster) } - public static var hexDigit: _CharacterClassModel { + static var hexDigit: _CharacterClassModel { .init(cc: .hexDigit, matchLevel: .graphemeCluster) } - public static var horizontalWhitespace: _CharacterClassModel { + static var horizontalWhitespace: _CharacterClassModel { .init(cc: .horizontalWhitespace, matchLevel: .graphemeCluster) } - public static var newlineSequence: _CharacterClassModel { + static var newlineSequence: _CharacterClassModel { .init(cc: .newlineSequence, matchLevel: .graphemeCluster) } - public static var verticalWhitespace: _CharacterClassModel { + static var verticalWhitespace: _CharacterClassModel { .init(cc: .verticalWhitespace, matchLevel: .graphemeCluster) } - public static var word: _CharacterClassModel { + static var word: _CharacterClassModel { .init(cc: .word, 
matchLevel: .graphemeCluster) } } extension _CharacterClassModel.Representation: CustomStringConvertible { - public var description: String { + var description: String { switch self { case .any: return "" case .anyGrapheme: return "" @@ -222,73 +220,11 @@ extension _CharacterClassModel.Representation: CustomStringConvertible { } extension _CharacterClassModel: CustomStringConvertible { - public var description: String { + var description: String { return "\(isInverted ? "not " : "")\(cc)" } } -extension _CharacterClassModel { - public func makeDSLTreeCharacterClass() -> DSLTree.CustomCharacterClass? { - // FIXME: Implement in DSLTree instead of wrapping an AST atom - switch makeAST() { - case .atom(let atom): - return .init(members: [.atom(.unconverted(.init(ast: atom)))]) - default: - return nil - } - } - - internal func makeAST() -> AST.Node? { - let inv = isInverted - - func esc(_ b: AST.Atom.EscapedBuiltin) -> AST.Node { - escaped(b) - } - - switch cc { - case .any: return atom(.any) - - case .digit: - return esc(inv ? .notDecimalDigit : .decimalDigit) - - case .horizontalWhitespace: - return esc( - inv ? .notHorizontalWhitespace : .horizontalWhitespace) - - // FIXME: newline sequence is not same as \n - case .newlineSequence: - return esc(inv ? .notNewline : .newline) - - case .whitespace: - return esc(inv ? .notWhitespace : .whitespace) - - case .verticalWhitespace: - return esc(inv ? .notVerticalTab : .verticalTab) - - case .word: - return esc(inv ? .notWordCharacter : .wordCharacter) - - case .anyGrapheme: - return esc(.graphemeCluster) - - case .hexDigit: - let members: [AST.CustomCharacterClass.Member] = [ - range_m(.char("a"), .char("f")), - range_m(.char("A"), .char("F")), - range_m(.char("0"), .char("9")), - ] - let ccc = AST.CustomCharacterClass( - .init(faking: inv ? 
.inverted : .normal), - members, - .fake) - - return .customCharacterClass(ccc) - - default: return nil - } - } -} - extension _CharacterClassModel { func withMatchLevel( _ level: _CharacterClassModel.MatchLevel From b454390b6b4b2bb89725fac1c10f225ec7840688 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:39 +0100 Subject: [PATCH 07/28] Fix `CharacterClass.newlineSequence` Map to `.newlineSequence` instead of `.newline`, which allows it to create the correct consumer. rdar://96330096 --- Sources/_StringProcessing/Regex/DSLTree.swift | 3 +- Tests/RegexBuilderTests/RegexDSLTests.swift | 110 ++++++++++++++++++ Tests/RegexTests/MatchTests.swift | 43 +++++++ 3 files changed, 154 insertions(+), 2 deletions(-) diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index eb357ae87..dc695cbf1 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -837,8 +837,7 @@ extension DSLTree { .init(ast: .init(.escaped(.horizontalWhitespace), .fake)) } public static var _newlineSequence: Self { - // FIXME: newline sequence is not same as \n - .init(ast: .init(.escaped(.newline), .fake)) + .init(ast: .init(.escaped(.newlineSequence), .fake)) } public static var _verticalWhitespace: Self { .init(ast: .init(.escaped(.verticalTab), .fake)) diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 47d2ebf02..995a67b3e 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -110,6 +110,116 @@ class RegexDSLTests: XCTestCase { CharacterClass.whitespace.inverted } } + + let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" + let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" + + // `.newlineSequence` and `.verticalWhitespace` match the same set of + // newlines in grapheme semantic mode, and scalar mode when applied with + // OneOrMore. 
+ for cc in [CharacterClass.newlineSequence, .verticalWhitespace] { + for mode in [RegexSemanticLevel.unicodeScalar, .graphemeCluster] { + try _testDSLCaptures( + ("\n", ("\n", "\n")), + ("\r", ("\r", "\r")), + ("\r\n", ("\r\n", "\r\n")), + (allNewlines, (allNewlines[...], allNewlines[...])), + ("abc\ndef", ("abc\ndef", "\n")), + ("abc\n\r\ndef", ("abc\n\r\ndef", "\n\r\n")), + ("abc\(allNewlines)def", ("abc\(allNewlines)def", allNewlines[...])), + ("abc", nil), + matchType: (Substring, Substring).self, ==) + { + Regex { + ZeroOrMore { + cc.inverted + } + Capture { + OneOrMore(cc) + } + ZeroOrMore { + cc.inverted + } + }.matchingSemantics(mode) + } + + // Try with ASCII-only whitespace. + try _testDSLCaptures( + ("\n", ("\n", "\n")), + ("\r", ("\r", "\r")), + ("\r\n", ("\r\n", "\r\n")), + (allNewlines, (allNewlines[...], asciiNewlines[...])), + ("abc\ndef", ("abc\ndef", "\n")), + ("abc\n\r\ndef", ("abc\n\r\ndef", "\n\r\n")), + ("abc\(allNewlines)def", ("abc\(allNewlines)def", asciiNewlines[...])), + ("abc", nil), + matchType: (Substring, Substring).self, ==) + { + Regex { + ZeroOrMore { + cc.inverted + } + Capture { + OneOrMore(cc) + } + ZeroOrMore { + cc.inverted + } + }.matchingSemantics(mode).asciiOnlyWhitespace() + } + } + } + + // `.newlineSequence` in scalar mode may match a single `\r\n`. + // `.verticalWhitespace` may not. 
+ for asciiOnly in [true, false] { + try _testDSLCaptures( + ("\r", "\r"), + ("\r\n", "\r\n"), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.newlineSequence + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", nil), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.newlineSequence.inverted + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", "\r"), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.verticalWhitespace + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", nil), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.verticalWhitespace.inverted + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", nil), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.verticalWhitespace.inverted + "\n" + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + } } func testCharacterClassOperations() throws { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index bb0d2b63e..8f7baf4b9 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -634,6 +634,49 @@ extension RegexTests { ("\n", true), ("\r", true)) + let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" + let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" + + for level in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + firstMatchTest( + #"\R+"#, + input: "abc\(allNewlines)def", match: allNewlines, + semanticLevel: level + ) + firstMatchTest( + #"\v+"#, + input: "abc\(allNewlines)def", match: allNewlines, + semanticLevel: level + ) + } + + // In scalar mode, \R can match \r\n, \v cannot. 
+ firstMatchTest( + #"\R"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar) + firstMatchTest( + #"\v"#, input: "\r\n", match: "\r", semanticLevel: .unicodeScalar) + firstMatchTest( + #"\v\v"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar) + firstMatchTest( + #"[^\v]"#, input: "\r\n", match: nil, semanticLevel: .unicodeScalar) + + // ASCII-only spaces. + firstMatchTest(#"(?S)\R+"#, input: allNewlines, match: asciiNewlines) + firstMatchTest(#"(?S)\v+"#, input: allNewlines, match: asciiNewlines) + firstMatchTest( + #"(?S)\R"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar) + firstMatchTest( + #"(?S)\v"#, input: "\r\n", match: "\r", semanticLevel: .unicodeScalar) + + matchTest( + #"[a]\u0301"#, + ("a\u{301}", false), + semanticLevel: .graphemeCluster) + matchTest( + #"[a]\u0301"#, + ("a\u{301}", true), + semanticLevel: .unicodeScalar) + firstMatchTest("[-]", input: "123-abcxyz", match: "-") // These are metacharacters in certain contexts, but normal characters From 8e920c9d607bfb1d399a7fe4ecda686a484bc952 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:40 +0100 Subject: [PATCH 08/28] Rename `any` -> `dot` Explicitly disambiguate the fact we're talking about `.`, which does not match newlines unless in single line mode. 
--- Sources/RegexBuilder/CharacterClass.swift | 2 +- Sources/_RegexParser/Regex/AST/Atom.swift | 8 ++++---- .../_RegexParser/Regex/Parse/LexicalAnalysis.swift | 2 +- Sources/_RegexParser/Regex/Parse/Sema.swift | 4 ++-- Sources/_RegexParser/Regex/Printing/DumpAST.swift | 2 +- Sources/_StringProcessing/ByteCodeGen.swift | 10 +++++----- Sources/_StringProcessing/ConsumerInterface.swift | 6 +++--- Sources/_StringProcessing/PrintAsPattern.swift | 10 ++++++---- Sources/_StringProcessing/Regex/ASTConversion.swift | 2 +- Sources/_StringProcessing/Regex/DSLTree.swift | 13 ++++++++----- .../_StringProcessing/_CharacterClassModel.swift | 6 +++--- Tests/RegexTests/ParseTests.swift | 10 +++++----- 12 files changed, 40 insertions(+), 35 deletions(-) diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index 4e96e510d..8b4a21fb7 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -42,7 +42,7 @@ extension CharacterClass { @available(SwiftStdlib 5.7, *) extension RegexComponent where Self == CharacterClass { public static var any: CharacterClass { - .init(DSLTree.CustomCharacterClass(members: [.atom(.any)])) + .init(DSLTree.CustomCharacterClass(members: [.atom(.dot)])) } public static var anyGraphemeCluster: CharacterClass { diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index f1419ad78..2e39c9c4c 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -60,7 +60,7 @@ extension AST { case namedCharacter(String) /// . 
- case any + case dot /// ^ case startOfLine @@ -104,7 +104,7 @@ extension AST.Atom { case .callout(let v): return v case .backtrackingDirective(let v): return v case .changeMatchingOptions(let v): return v - case .any: return nil + case .dot: return nil case .startOfLine: return nil case .endOfLine: return nil case .invalid: return nil @@ -806,7 +806,7 @@ extension AST.Atom { // the AST? Or defer for the matching engine? return nil - case .scalarSequence, .property, .any, .startOfLine, .endOfLine, + case .scalarSequence, .property, .dot, .startOfLine, .endOfLine, .backreference, .subpattern, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: return nil @@ -858,7 +858,7 @@ extension AST.Atom { case .keyboardMetaControl(let x): return "\\M-\\C-\(x)" - case .property, .escaped, .any, .startOfLine, .endOfLine, + case .property, .escaped, .dot, .startOfLine, .endOfLine, .backreference, .subpattern, .namedCharacter, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: return nil diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 2168dbb03..d14a17785 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -2073,7 +2073,7 @@ extension Parser { p.unreachable("Should have lexed a group or group-like atom") // (sometimes) special metacharacters - case ".": return customCC ? .char(".") : .any + case ".": return customCC ? .char(".") : .dot case "^": return customCC ? .char("^") : .startOfLine case "$": return customCC ? 
.char("$") : .endOfLine diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 0aeee282d..88744bae2 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -221,7 +221,7 @@ extension RegexValidator { ) { switch esc { case .resetStartOfMatch, .singleDataUnit, .trueAnychar, - // '\N' needs to be emitted using 'emitAny'. + // '\N' needs to be emitted using 'emitDot'. .notNewline: error(.unsupported("'\\\(esc.character)'"), at: loc) @@ -288,7 +288,7 @@ extension RegexValidator { at: atom.location) } - case .char, .scalar, .startOfLine, .endOfLine, .any: + case .char, .scalar, .startOfLine, .endOfLine, .dot: break case .invalid: diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 48a2512cf..68c32e4a1 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -153,7 +153,7 @@ extension AST.Atom { case .keyboardControl, .keyboardMeta, .keyboardMetaControl: fatalError("TODO") - case .any: return "." + case .dot: return "." 
case .startOfLine: return "^" case .endOfLine: return "$" diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d18d50aa0..07fe09016 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -55,8 +55,8 @@ fileprivate extension Compiler.ByteCodeGen { } } switch a { - case .any: - emitAny() + case .dot: + emitDot() case let .char(c): try emitCharacter(c) @@ -282,7 +282,7 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitAny() { + mutating func emitDot() { switch (options.semanticLevel, options.dotMatchesNewline) { case (.graphemeCluster, true): builder.buildAdvance(1) @@ -758,9 +758,9 @@ fileprivate extension Compiler.ByteCodeGen { try emitQuantification(amt.ast, kind, child) case let .customCharacterClass(ccc): - if ccc.containsAny { + if ccc.containsDot { if !ccc.isInverted { - emitAny() + emitDot() } else { throw Unsupported("Inverted any") } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index dbb324b67..43ec45f6c 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -111,7 +111,7 @@ extension DSLTree.Atom { : $0 == s } - case .any: + case .dot: // FIXME: Should this be a total ordering? 
if opts.semanticLevel == .graphemeCluster { return { input, bounds in @@ -264,10 +264,10 @@ extension AST.Atom { case let .namedCharacter(name): return consumeName(name, opts: opts) - case .any: + case .dot: assertionFailure( "Should have been handled by tree conversion") - fatalError(".atom(.any) is handled in emitAny") + fatalError(".atom(.dot) is handled in emitDot") case .startOfLine, .endOfLine: // handled in emitAssertion diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 4237eda33..d936e73b7 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -895,7 +895,8 @@ extension AST.Atom { case .namedCharacter: return (" /* TODO: named character */", false) - case .any: + case .dot: + // FIXME: This is wrong, the DSL doesn't have an equivalent to .dot. return (".any", true) case .startOfLine, .endOfLine: @@ -950,7 +951,7 @@ extension AST.Atom { case .namedCharacter(let n): return "\\N{\(n)}" - case .any: + case .dot: return "." case .startOfLine, .endOfLine: @@ -1099,7 +1100,8 @@ extension DSLTree.Atom { _ printer: inout PrettyPrinter ) -> (String, canBeWrapped: Bool)? { switch self { - case .any: + case .dot: + // FIXME: This is wrong, the DSL doesn't have an equivalent to .dot. return (".any", true) case let .char(c): @@ -1141,7 +1143,7 @@ extension DSLTree.Atom { var _regexBase: String { switch self { - case .any: + case .dot: return "." 
case let .char(c): diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 320d10897..12068e1bc 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -217,7 +217,7 @@ extension AST.Atom { switch self.kind { case let .char(c): return .char(c) case let .scalar(s): return .char(Character(s.value)) - case .any: return .any + case .dot: return .dot case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index dc695cbf1..56cec73ab 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -117,11 +117,11 @@ extension DSLTree { var members: [Member] var isInverted: Bool - var containsAny: Bool { + var containsDot: Bool { members.contains { member in switch member { - case .atom(.any): return true - case .custom(let ccc): return ccc.containsAny + case .atom(.dot): return true + case .custom(let ccc): return ccc.containsDot default: return false } @@ -245,7 +245,10 @@ extension DSLTree { public enum Atom { case char(Character) case scalar(Unicode.Scalar) - case any + + /// The DSL representation of '.' in a regex literal. This does not match + /// newlines unless single line mode is enabled. 
+ case dot case assertion(_AST.AssertionKind) case backreference(_AST.Reference) @@ -857,7 +860,7 @@ extension DSLTree.Atom { switch self { case .changeMatchingOptions, .assertion: return false - case .char, .scalar, .any, .backreference, .symbolicReference, .unconverted: + case .char, .scalar, .dot, .backreference, .symbolicReference, .unconverted: return true } } diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index c0de6ebaa..9f515f220 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -245,8 +245,8 @@ extension AST.Atom { // this? Or does grapheme-semantic mode complicate that? return nil - case .any: - // `.any` is handled in the matching engine by Compiler.emitAny() and in + case .dot: + // `.dot` is handled in the matching engine by Compiler.emitDot() and in // the legacy compiler by the `.any` instruction, which can provide lower // level instructions than the CharacterClass-generated consumer closure // @@ -275,7 +275,7 @@ extension AST.Atom.EscapedBuiltin { // FIXME: This is more like '.' than inverted '\R', as it is affected // by e.g (*CR). We should therefore really be emitting it through - // emitAny(). For now we treat it as semantically invalid. + // emitDot(). For now we treat it as semantically invalid. 
case .notNewline: return .newlineSequence.inverted case .whitespace: return .whitespace diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 3c43f27af..52a272915 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -359,14 +359,14 @@ extension RegexTests { parseTest( "(.)*(.*)", concat( - zeroOrMore(of: capture(atom(.any))), - capture(zeroOrMore(of: atom(.any)))), + zeroOrMore(of: capture(atom(.dot))), + capture(zeroOrMore(of: atom(.dot)))), captures: [.opt, .cap]) parseTest( "((.))*((.)?)", concat( - zeroOrMore(of: capture(capture(atom(.any)))), - capture(zeroOrOne(of: capture(atom(.any))))), + zeroOrMore(of: capture(capture(atom(.dot)))), + capture(zeroOrOne(of: capture(atom(.dot))))), captures: [.opt, .opt, .cap, .opt]) parseTest( #"abc\d"#, @@ -479,7 +479,7 @@ extension RegexTests { parseTest(#"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit))) - // FIXME: '\N' should be emitted through 'emitAny', not through the + // FIXME: '\N' should be emitted through 'emitDot', not through the // _CharacterClassModel model. parseTest(#"\N"#, escaped(.notNewline), unsupported: true) From d6a03a08b4d63f081b88ad7683de3215777c31bd Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:40 +0100 Subject: [PATCH 09/28] Re-introduce `DSLTree.Atom.any` This time as a "true any" that matches any character, including newlines. 
--- Sources/_StringProcessing/ByteCodeGen.swift | 33 ++++++++++++------- .../_StringProcessing/ConsumerInterface.swift | 2 +- .../_StringProcessing/PrintAsPattern.swift | 6 ++++ Sources/_StringProcessing/Regex/DSLTree.swift | 6 +++- 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 07fe09016..5e0b559cb 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -55,6 +55,9 @@ fileprivate extension Compiler.ByteCodeGen { } } switch a { + case .any: + emitAny() + case .dot: emitDot() @@ -282,23 +285,31 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitDot() { - switch (options.semanticLevel, options.dotMatchesNewline) { - case (.graphemeCluster, true): + mutating func emitAny() { + switch options.semanticLevel { + case .graphemeCluster: builder.buildAdvance(1) - case (.graphemeCluster, false): + case .unicodeScalar: + // TODO: builder.buildAdvanceUnicodeScalar(1) builder.buildConsume { input, bounds in - input[bounds.lowerBound].isNewline - ? nil - : input.index(after: bounds.lowerBound) + input.unicodeScalars.index(after: bounds.lowerBound) } + } + } - case (.unicodeScalar, true): - // TODO: builder.buildAdvanceUnicodeScalar(1) + mutating func emitDot() { + if options.dotMatchesNewline { + emitAny() + return + } + switch options.semanticLevel { + case .graphemeCluster: builder.buildConsume { input, bounds in - input.unicodeScalars.index(after: bounds.lowerBound) + input[bounds.lowerBound].isNewline + ? nil + : input.index(after: bounds.lowerBound) } - case (.unicodeScalar, false): + case .unicodeScalar: builder.buildConsume { input, bounds in input[bounds.lowerBound].isNewline ? 
nil diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 43ec45f6c..fd29e6045 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -111,7 +111,7 @@ extension DSLTree.Atom { : $0 == s } - case .dot: + case .any, .dot: // FIXME: Should this be a total ordering? if opts.semanticLevel == .graphemeCluster { return { input, bounds in diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index d936e73b7..0debe5059 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -1100,6 +1100,9 @@ extension DSLTree.Atom { _ printer: inout PrettyPrinter ) -> (String, canBeWrapped: Bool)? { switch self { + case .any: + return (".any", true) + case .dot: // FIXME: This is wrong, the DSL doesn't have an equivalent to .dot. return (".any", true) @@ -1143,6 +1146,9 @@ extension DSLTree.Atom { var _regexBase: String { switch self { + case .any: + return "(?s:.)" + case .dot: return "." diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 56cec73ab..7a4e7d30e 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -246,6 +246,9 @@ extension DSLTree { case char(Character) case scalar(Unicode.Scalar) + /// Any character, including newlines. + case any + /// The DSL representation of '.' in a regex literal. This does not match /// newlines unless single line mode is enabled. 
case dot @@ -860,7 +863,8 @@ extension DSLTree.Atom { switch self { case .changeMatchingOptions, .assertion: return false - case .char, .scalar, .dot, .backreference, .symbolicReference, .unconverted: + case .char, .scalar, .any, .dot, .backreference, .symbolicReference, + .unconverted: return true } } From da59c305fe9cd71250147fa932dbd3aba8e5245d Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:40 +0100 Subject: [PATCH 10/28] Fix `CharacterClass.any` This should map to `.any`, not `.dot`. rdar://96509234 --- Sources/RegexBuilder/CharacterClass.swift | 2 +- .../_StringProcessing/ConsumerInterface.swift | 5 ++++- .../_StringProcessing/PrintAsPattern.swift | 8 ++++---- Tests/RegexBuilderTests/RegexDSLTests.swift | 20 ++++++++++++++++--- Tests/RegexTests/RenderDSLTests.swift | 17 ++++++++++++++++ 5 files changed, 43 insertions(+), 9 deletions(-) diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index 8b4a21fb7..4e96e510d 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -42,7 +42,7 @@ extension CharacterClass { @available(SwiftStdlib 5.7, *) extension RegexComponent where Self == CharacterClass { public static var any: CharacterClass { - .init(DSLTree.CustomCharacterClass(members: [.atom(.dot)])) + .init(DSLTree.CustomCharacterClass(members: [.atom(.any)])) } public static var anyGraphemeCluster: CharacterClass { diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index fd29e6045..afc507c41 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -111,7 +111,7 @@ extension DSLTree.Atom { : $0 == s } - case .any, .dot: + case .any: // FIXME: Should this be a total ordering? 
if opts.semanticLevel == .graphemeCluster { return { input, bounds in @@ -123,6 +123,9 @@ extension DSLTree.Atom { } } + case .dot: + throw Unreachable(".atom(.dot) should be handled by emitDot") + case .assertion: // TODO: We could handle, should this be total? return nil diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 0debe5059..9332756f1 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -896,8 +896,8 @@ extension AST.Atom { return (" /* TODO: named character */", false) case .dot: - // FIXME: This is wrong, the DSL doesn't have an equivalent to .dot. - return (".any", true) + // The DSL does not have an equivalent to '.', print as a regex. + return ("/./", false) case .startOfLine, .endOfLine: fatalError("unreachable") @@ -1104,8 +1104,8 @@ extension DSLTree.Atom { return (".any", true) case .dot: - // FIXME: This is wrong, the DSL doesn't have an equivalent to .dot. - return (".any", true) + // The DSL does not have an equivalent to '.', print as a regex. 
+ return ("/./", false) case let .char(c): return (String(c)._quoted, false) diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 995a67b3e..ee6bf85f9 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -69,6 +69,9 @@ class RegexDSLTests: XCTestCase { XCTAssertTrue(match.output == substringMatch.output) } + let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" + let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" + func testCharacterClasses() throws { try _testDSLCaptures( ("a c", ("a c", " ", "c")), @@ -111,9 +114,6 @@ class RegexDSLTests: XCTestCase { } } - let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" - let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" - // `.newlineSequence` and `.verticalWhitespace` match the same set of // newlines in grapheme semantic mode, and scalar mode when applied with // OneOrMore. @@ -243,6 +243,20 @@ class RegexDSLTests: XCTestCase { } } + func testAny() throws { + // .any matches newlines regardless of matching options. 
+ for dotMatchesNewline in [true, false] { + try _testDSLCaptures( + ("abc\(allNewlines)def", "abc\(allNewlines)def"), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.any) + }.dotMatchesNewlines(dotMatchesNewline) + } + } + } + func testMatchResultDotZeroWithoutCapture() throws { let match = try XCTUnwrap("aaa".wholeMatch { OneOrMore { "a" } }) XCTAssertEqual(match.0, "aaa") diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 97ba3e333..460cc8e14 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -68,6 +68,23 @@ extension RenderDSLTests { } """) } + + func testDot() throws { + try testConversion(#".+"#, #""" + Regex { + OneOrMore { + /./ + } + } + """#) + try testConversion(#"a.c"#, #""" + Regex { + "a" + /./ + "c" + } + """#) + } func testOptions() throws { try XCTExpectFailure("Options like '(?i)' aren't converted") { From 217aef4cc962ec4616378b45d15ba2b0396c320a Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:41 +0100 Subject: [PATCH 11/28] Rename `startOfLine`/`endOfLine` -> `caretAnchor`/`dollarAnchor` --- Sources/_RegexParser/Regex/AST/Atom.swift | 26 +++++++++---------- .../Regex/Parse/LexicalAnalysis.swift | 4 +-- Sources/_RegexParser/Regex/Parse/Sema.swift | 2 +- .../_RegexParser/Regex/Printing/DumpAST.swift | 6 ++--- Sources/_StringProcessing/ByteCodeGen.swift | 4 +-- .../_StringProcessing/ConsumerInterface.swift | 2 +- .../_StringProcessing/PrintAsPattern.swift | 10 ++++--- Sources/_StringProcessing/Regex/DSLTree.swift | 4 +-- 8 files changed, 30 insertions(+), 28 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 2e39c9c4c..6d8f62c42 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -63,10 +63,10 @@ extension AST { case dot /// ^ - case startOfLine + case caretAnchor /// $ - case endOfLine + case 
dollarAnchor // References case backreference(Reference) @@ -105,8 +105,8 @@ extension AST.Atom { case .backtrackingDirective(let v): return v case .changeMatchingOptions(let v): return v case .dot: return nil - case .startOfLine: return nil - case .endOfLine: return nil + case .caretAnchor: return nil + case .dollarAnchor: return nil case .invalid: return nil } } @@ -536,10 +536,10 @@ extension AST.Atom { case notTextSegment = #"\Y"# /// ^ - case startOfLine = #"^"# + case caretAnchor = #"^"# /// $ - case endOfLine = #"$"# + case dollarAnchor = #"$"# /// \b (from outside a custom character class) case wordBoundary = #"\b"# @@ -551,8 +551,8 @@ extension AST.Atom { public var assertionKind: AssertionKind? { switch kind { - case .startOfLine: return .startOfLine - case .endOfLine: return .endOfLine + case .caretAnchor: return .caretAnchor + case .dollarAnchor: return .dollarAnchor case .escaped(.wordBoundary): return .wordBoundary case .escaped(.notWordBoundary): return .notWordBoundary @@ -806,9 +806,9 @@ extension AST.Atom { // the AST? Or defer for the matching engine? return nil - case .scalarSequence, .property, .dot, .startOfLine, .endOfLine, - .backreference, .subpattern, .callout, .backtrackingDirective, - .changeMatchingOptions, .invalid: + case .scalarSequence, .property, .dot, .caretAnchor, + .dollarAnchor, .backreference, .subpattern, .callout, + .backtrackingDirective, .changeMatchingOptions, .invalid: return nil } } @@ -858,7 +858,7 @@ extension AST.Atom { case .keyboardMetaControl(let x): return "\\M-\\C-\(x)" - case .property, .escaped, .dot, .startOfLine, .endOfLine, + case .property, .escaped, .dot, .caretAnchor, .dollarAnchor, .backreference, .subpattern, .namedCharacter, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: return nil @@ -874,7 +874,7 @@ extension AST.Atom { // TODO: Are callouts quantifiable? 
case .escaped(let esc): return esc.isQuantifiable - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: return false default: return true diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index d14a17785..4a4f5c05f 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -2074,8 +2074,8 @@ extension Parser { // (sometimes) special metacharacters case ".": return customCC ? .char(".") : .dot - case "^": return customCC ? .char("^") : .startOfLine - case "$": return customCC ? .char("$") : .endOfLine + case "^": return customCC ? .char("^") : .caretAnchor + case "$": return customCC ? .char("$") : .dollarAnchor // Escaped case "\\": return p.expectEscaped().value diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 88744bae2..ea541fba7 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -288,7 +288,7 @@ extension RegexValidator { at: atom.location) } - case .char, .scalar, .startOfLine, .endOfLine, .dot: + case .char, .scalar, .caretAnchor, .dollarAnchor, .dot: break case .invalid: diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 68c32e4a1..cf5a56721 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -153,9 +153,9 @@ extension AST.Atom { case .keyboardControl, .keyboardMeta, .keyboardMetaControl: fatalError("TODO") - case .dot: return "." - case .startOfLine: return "^" - case .endOfLine: return "$" + case .dot: return "." 
+ case .caretAnchor: return "^" + case .dollarAnchor: return "$" case .backreference(let r), .subpattern(let r): return "\(r._dumpBase)" diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 5e0b559cb..dd4915851 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -170,7 +170,7 @@ fileprivate extension Compiler.ByteCodeGen { !input.isOnGraphemeClusterBoundary(pos) } - case .startOfLine: + case .caretAnchor: // FIXME: Anchor.startOfLine must always use this first branch // The behavior of `^` should depend on `anchorsMatchNewlines`, but // the DSL-based `.startOfLine` anchor should always match the start @@ -192,7 +192,7 @@ fileprivate extension Compiler.ByteCodeGen { } } - case .endOfLine: + case .dollarAnchor: // FIXME: Anchor.endOfLine must always use this first branch // The behavior of `$` should depend on `anchorsMatchNewlines`, but // the DSL-based `.endOfLine` anchor should always match the end diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index afc507c41..ae7149a00 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -272,7 +272,7 @@ extension AST.Atom { "Should have been handled by tree conversion") fatalError(".atom(.dot) is handled in emitDot") - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: // handled in emitAssertion return nil diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 9332756f1..fc257cad4 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -627,9 +627,11 @@ extension AST.Atom.AssertionKind { // TODO: Some way to integrate this with conversion... 
var _patternBase: String { switch self { - case .startOfLine: + case .caretAnchor: + // FIXME: The DSL doesn't have a way of representing this. return "Anchor.startOfLine" - case .endOfLine: + case .dollarAnchor: + // FIXME: The DSL doesn't have a way of representing this. return "Anchor.endOfLine" case .wordBoundary: return "Anchor.wordBoundary" @@ -899,7 +901,7 @@ extension AST.Atom { // The DSL does not have an equivalent to '.', print as a regex. return ("/./", false) - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: fatalError("unreachable") case .backreference: @@ -954,7 +956,7 @@ extension AST.Atom { case .dot: return "." - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: fatalError("unreachable") case .backreference: diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 7a4e7d30e..93954bbf9 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -801,10 +801,10 @@ extension DSLTree { : .init(ast: .textSegment) } public static func startOfLine(_ inverted: Bool = false) -> Self { - .init(ast: .startOfLine) + .init(ast: .caretAnchor) } public static func endOfLine(_ inverted: Bool = false) -> Self { - .init(ast: .endOfLine) + .init(ast: .dollarAnchor) } public static func wordBoundary(_ inverted: Bool = false) -> Self { inverted From dff47ff08fb463de49dec6b48a80b080fc24acf4 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:41 +0100 Subject: [PATCH 12/28] Move AssertionKind onto the DSL This enum will start including cases that only the DSL can use, so move it off the AST. 
--- Sources/RegexBuilder/Anchor.swift | 32 +++++--- Sources/_RegexParser/Regex/AST/Atom.swift | 61 --------------- .../Regex/Printing/PrintAsCanonical.swift | 7 +- Sources/_StringProcessing/ByteCodeGen.swift | 4 +- .../_StringProcessing/PrintAsPattern.swift | 6 +- .../Regex/ASTConversion.swift | 32 +++++++- Sources/_StringProcessing/Regex/DSLTree.swift | 74 ++++++++++--------- .../Utility/RegexFactory.swift | 2 +- 8 files changed, 102 insertions(+), 116 deletions(-) diff --git a/Sources/RegexBuilder/Anchor.swift b/Sources/RegexBuilder/Anchor.swift index 28fc7e8d1..3a5c6eb6a 100644 --- a/Sources/RegexBuilder/Anchor.swift +++ b/Sources/RegexBuilder/Anchor.swift @@ -37,16 +37,30 @@ public struct Anchor { @available(SwiftStdlib 5.7, *) extension Anchor: RegexComponent { - var baseAssertion: DSLTree._AST.AssertionKind { + var baseAssertion: DSLTree.Atom.Assertion { switch kind { - case .startOfSubject: return .startOfSubject(isInverted) - case .endOfSubjectBeforeNewline: return .endOfSubjectBeforeNewline(isInverted) - case .endOfSubject: return .endOfSubject(isInverted) - case .firstMatchingPositionInSubject: return .firstMatchingPositionInSubject(isInverted) - case .textSegmentBoundary: return .textSegmentBoundary(isInverted) - case .startOfLine: return .startOfLine(isInverted) - case .endOfLine: return .endOfLine(isInverted) - case .wordBoundary: return .wordBoundary(isInverted) + case .startOfSubject: + // FIXME: Inverted? + return .startOfSubject + case .endOfSubjectBeforeNewline: + // FIXME: Inverted? + return .endOfSubjectBeforeNewline + case .endOfSubject: + // FIXME: Inverted? + return .endOfSubject + case .firstMatchingPositionInSubject: + // FIXME: Inverted? + return .firstMatchingPositionInSubject + case .textSegmentBoundary: + return isInverted ? .notTextSegment : .textSegment + case .startOfLine: + // FIXME: Inverted? + return .caretAnchor + case .endOfLine: + // FIXME: Inverted? + return .dollarAnchor + case .wordBoundary: + return isInverted ? 
.notWordBoundary : .wordBoundary } } diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 6d8f62c42..b03ce8c39 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -511,67 +511,6 @@ extension AST.Atom.CharacterProperty { } } -extension AST.Atom { - /// Anchors and other built-in zero-width assertions. - public enum AssertionKind: String, Hashable { - /// \A - case startOfSubject = #"\A"# - - /// \Z - case endOfSubjectBeforeNewline = #"\Z"# - - /// \z - case endOfSubject = #"\z"# - - /// \K - case resetStartOfMatch = #"\K"# - - /// \G - case firstMatchingPositionInSubject = #"\G"# - - /// \y - case textSegment = #"\y"# - - /// \Y - case notTextSegment = #"\Y"# - - /// ^ - case caretAnchor = #"^"# - - /// $ - case dollarAnchor = #"$"# - - /// \b (from outside a custom character class) - case wordBoundary = #"\b"# - - /// \B - case notWordBoundary = #"\B"# - - } - - public var assertionKind: AssertionKind? 
{ - switch kind { - case .caretAnchor: return .caretAnchor - case .dollarAnchor: return .dollarAnchor - - case .escaped(.wordBoundary): return .wordBoundary - case .escaped(.notWordBoundary): return .notWordBoundary - case .escaped(.startOfSubject): return .startOfSubject - case .escaped(.endOfSubject): return .endOfSubject - case .escaped(.textSegment): return .textSegment - case .escaped(.notTextSegment): return .notTextSegment - case .escaped(.endOfSubjectBeforeNewline): - return .endOfSubjectBeforeNewline - case .escaped(.firstMatchingPositionInSubject): - return .firstMatchingPositionInSubject - - case .escaped(.resetStartOfMatch): return .resetStartOfMatch - - default: return nil - } - } -} - extension AST.Atom { public enum Callout: Hashable { /// A PCRE callout written `(?C...)` diff --git a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift index 0e7cfb1d3..6b8c8ab93 100644 --- a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift +++ b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift @@ -237,9 +237,6 @@ extension AST.Atom.Number { extension AST.Atom { var _canonicalBase: String { - if let anchor = self.assertionKind { - return anchor.rawValue - } if let lit = self.literalStringValue { // FIXME: We may have to re-introduce escapes // For example, `\.` will come back as "." 
instead @@ -248,6 +245,10 @@ extension AST.Atom { return lit } switch self.kind { + case .caretAnchor: + return "^" + case .dollarAnchor: + return "$" case .escaped(let e): return "\\\(e.character)" case .backreference(let br): diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index dd4915851..5636c6d6c 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -68,7 +68,7 @@ fileprivate extension Compiler.ByteCodeGen { try emitScalar(s) case let .assertion(kind): - try emitAssertion(kind.ast) + try emitAssertion(kind) case let .backreference(ref): try emitBackreference(ref.ast) @@ -114,7 +114,7 @@ fileprivate extension Compiler.ByteCodeGen { } mutating func emitAssertion( - _ kind: AST.Atom.AssertionKind + _ kind: DSLTree.Atom.Assertion ) throws { // FIXME: Depends on API model we have... We may want to // think through some of these with API interactions in mind diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index fc257cad4..439316b4a 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -623,7 +623,7 @@ extension String { } } -extension AST.Atom.AssertionKind { +extension DSLTree.Atom.Assertion { // TODO: Some way to integrate this with conversion... var _patternBase: String { switch self { @@ -811,7 +811,7 @@ extension AST.Atom { /// /// TODO: Some way to integrate this with conversion... 
var _patternBase: (String, canBeWrapped: Bool) { - if let anchor = self.assertionKind { + if let anchor = self.dslAssertionKind { return (anchor._patternBase, false) } @@ -1124,7 +1124,7 @@ extension DSLTree.Atom { } case .assertion(let a): - return (a.ast._patternBase, false) + return (a._patternBase, false) case .backreference(_): return ("/* TOOD: backreferences */", false) diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 12068e1bc..2146fd61b 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -208,10 +208,38 @@ extension AST.CustomCharacterClass { } } +extension AST.Atom.EscapedBuiltin { + var dslAssertionKind: DSLTree.Atom.Assertion? { + switch self { + case .wordBoundary: return .wordBoundary + case .notWordBoundary: return .notWordBoundary + case .startOfSubject: return .startOfSubject + case .endOfSubject: return .endOfSubject + case .textSegment: return .textSegment + case .notTextSegment: return .notTextSegment + case .endOfSubjectBeforeNewline: return .endOfSubjectBeforeNewline + case .firstMatchingPositionInSubject: return .firstMatchingPositionInSubject + case .resetStartOfMatch: return .resetStartOfMatch + default: return nil + } + } +} + +extension AST.Atom { + var dslAssertionKind: DSLTree.Atom.Assertion? 
{ + switch kind { + case .caretAnchor: return .caretAnchor + case .dollarAnchor: return .dollarAnchor + case .escaped(let b): return b.dslAssertionKind + default: return nil + } + } +} + extension AST.Atom { var dslTreeAtom: DSLTree.Atom { - if let kind = assertionKind { - return .assertion(.init(ast: kind)) + if let kind = dslAssertionKind { + return .assertion(kind) } switch self.kind { diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 93954bbf9..f55c3bc01 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -253,7 +253,7 @@ extension DSLTree { /// newlines unless single line mode is enabled. case dot - case assertion(_AST.AssertionKind) + case assertion(Assertion) case backreference(_AST.Reference) case symbolicReference(ReferenceID) @@ -263,6 +263,44 @@ extension DSLTree { } } +extension DSLTree.Atom { + @_spi(RegexBuilder) + public enum Assertion: Hashable { + /// \A + case startOfSubject + + /// \Z + case endOfSubjectBeforeNewline + + /// \z + case endOfSubject + + /// \K + case resetStartOfMatch + + /// \G + case firstMatchingPositionInSubject + + /// \y + case textSegment + + /// \Y + case notTextSegment + + /// ^ + case caretAnchor + + /// $ + case dollarAnchor + + /// \b (from outside a custom character class) + case wordBoundary + + /// \B + case notWordBoundary + } +} + extension Unicode.GeneralCategory { var extendedGeneralCategory: Unicode.ExtendedGeneralCategory? 
{ switch self { @@ -779,40 +817,6 @@ extension DSLTree { internal var ast: AST.AbsentFunction } - @_spi(RegexBuilder) - public struct AssertionKind { - internal var ast: AST.Atom.AssertionKind - - public static func startOfSubject(_ inverted: Bool = false) -> Self { - .init(ast: .startOfSubject) - } - public static func endOfSubjectBeforeNewline(_ inverted: Bool = false) -> Self { - .init(ast: .endOfSubjectBeforeNewline) - } - public static func endOfSubject(_ inverted: Bool = false) -> Self { - .init(ast: .endOfSubject) - } - public static func firstMatchingPositionInSubject(_ inverted: Bool = false) -> Self { - .init(ast: .firstMatchingPositionInSubject) - } - public static func textSegmentBoundary(_ inverted: Bool = false) -> Self { - inverted - ? .init(ast: .notTextSegment) - : .init(ast: .textSegment) - } - public static func startOfLine(_ inverted: Bool = false) -> Self { - .init(ast: .caretAnchor) - } - public static func endOfLine(_ inverted: Bool = false) -> Self { - .init(ast: .dollarAnchor) - } - public static func wordBoundary(_ inverted: Bool = false) -> Self { - inverted - ? 
.init(ast: .notWordBoundary) - : .init(ast: .wordBoundary) - } - } - @_spi(RegexBuilder) public struct Reference { internal var ast: AST.Reference diff --git a/Sources/_StringProcessing/Utility/RegexFactory.swift b/Sources/_StringProcessing/Utility/RegexFactory.swift index 693b04966..31245c0f7 100644 --- a/Sources/_StringProcessing/Utility/RegexFactory.swift +++ b/Sources/_StringProcessing/Utility/RegexFactory.swift @@ -40,7 +40,7 @@ public struct _RegexFactory { @_spi(RegexBuilder) @available(SwiftStdlib 5.7, *) public func assertion( - _ kind: DSLTree._AST.AssertionKind + _ kind: DSLTree.Atom.Assertion ) -> Regex { .init(node: .atom(.assertion(kind))) } From 1b3ba2ce6595f9d6b08dd7d0ee75cf5f2ae4f61d Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:42 +0100 Subject: [PATCH 13/28] Fix `Anchor.startOfLine` and `Anchor.endOfLine` Introduce `startOfLine` and `endOfLine` assertion kinds, and map the DSL to them such that they do not depend on matching options. rdar://97029630 --- Sources/RegexBuilder/Anchor.swift | 4 +- Sources/_StringProcessing/ByteCodeGen.swift | 64 ++++++++++--------- .../_StringProcessing/PrintAsPattern.swift | 12 ++-- Sources/_StringProcessing/Regex/DSLTree.swift | 8 +++ Tests/RegexBuilderTests/RegexDSLTests.swift | 35 ++++++++-- Tests/RegexTests/RenderDSLTests.swift | 16 ++++- 6 files changed, 96 insertions(+), 43 deletions(-) diff --git a/Sources/RegexBuilder/Anchor.swift b/Sources/RegexBuilder/Anchor.swift index 3a5c6eb6a..cf1931577 100644 --- a/Sources/RegexBuilder/Anchor.swift +++ b/Sources/RegexBuilder/Anchor.swift @@ -55,10 +55,10 @@ extension Anchor: RegexComponent { return isInverted ? .notTextSegment : .textSegment case .startOfLine: // FIXME: Inverted? - return .caretAnchor + return .startOfLine case .endOfLine: // FIXME: Inverted? - return .dollarAnchor + return .endOfLine case .wordBoundary: return isInverted ?
.notWordBoundary : .wordBoundary } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 5636c6d6c..0e0673988 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -113,6 +113,32 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitStartOfLine() { + builder.buildAssert { [semanticLevel = options.semanticLevel] + (_, _, input, pos, subjectBounds) in + if pos == subjectBounds.lowerBound { return true } + switch semanticLevel { + case .graphemeCluster: + return input[input.index(before: pos)].isNewline + case .unicodeScalar: + return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline + } + } + } + + mutating func emitEndOfLine() { + builder.buildAssert { [semanticLevel = options.semanticLevel] + (_, _, input, pos, subjectBounds) in + if pos == subjectBounds.upperBound { return true } + switch semanticLevel { + case .graphemeCluster: + return input[pos].isNewline + case .unicodeScalar: + return input.unicodeScalars[pos].isNewline + } + } + } + mutating func emitAssertion( _ kind: DSLTree.Atom.Assertion ) throws { @@ -170,44 +196,24 @@ fileprivate extension Compiler.ByteCodeGen { !input.isOnGraphemeClusterBoundary(pos) } + case .startOfLine: + emitStartOfLine() + + case .endOfLine: + emitEndOfLine() + case .caretAnchor: - // FIXME: Anchor.startOfLine must always use this first branch - // The behavior of `^` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.startOfLine` anchor should always match the start - // of a line. Right now we don't distinguish between those anchors. 
if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.lowerBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[input.index(before: pos)].isNewline - case .unicodeScalar: - return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline - } - } + emitStartOfLine() } else { builder.buildAssert { (_, _, input, pos, subjectBounds) in pos == subjectBounds.lowerBound } } - + case .dollarAnchor: - // FIXME: Anchor.endOfLine must always use this first branch - // The behavior of `$` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.endOfLine` anchor should always match the end - // of a line. Right now we don't distinguish between those anchors. if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[pos].isNewline - case .unicodeScalar: - return input.unicodeScalars[pos].isNewline - } - } + emitEndOfLine() } else { builder.buildAssert { (_, _, input, pos, subjectBounds) in pos == subjectBounds.upperBound diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 439316b4a..3e62d1886 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -627,12 +627,16 @@ extension DSLTree.Atom.Assertion { // TODO: Some way to integrate this with conversion... var _patternBase: String { switch self { - case .caretAnchor: - // FIXME: The DSL doesn't have a way of representing this. + case .startOfLine: return "Anchor.startOfLine" - case .dollarAnchor: - // FIXME: The DSL doesn't have a way of representing this. 
+ case .endOfLine: return "Anchor.endOfLine" + case .caretAnchor: + // The DSL doesn't have an equivalent to this, so print as regex. + return "/^/" + case .dollarAnchor: + // The DSL doesn't have an equivalent to this, so print as regex. + return "/$/" case .wordBoundary: return "Anchor.wordBoundary" case .notWordBoundary: diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index f55c3bc01..549a8b3a1 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -287,6 +287,14 @@ extension DSLTree.Atom { /// \Y case notTextSegment + /// The DSL's Anchor.startOfLine, which matches the start of a line + /// even if `anchorsMatchNewlines` is false. + case startOfLine + + /// The DSL's Anchor.endOfLine, which matches the end of a line + /// even if `anchorsMatchNewlines` is false. + case endOfLine + /// ^ case caretAnchor diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index ee6bf85f9..0fb96bf66 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -815,19 +815,40 @@ class RegexDSLTests: XCTestCase { Anchor.endOfSubject }.anchorsMatchLineEndings() } - - // FIXME: Anchor.start/endOfLine needs to always match line endings, - // even when the `anchorsMatchLineEndings()` option is turned off. + try _testDSLCaptures( - ("\naaa", "aaa"), - ("aaa\n", "aaa"), - ("\naaa\n", "aaa"), - matchType: Substring.self, ==, xfail: true) + ("\naaa", "\naaa"), + ("aaa\n", "aaa\n"), + ("\naaa\n", "\naaa\n"), + matchType: Substring.self, ==) { Regex { + Optionally { "\n" } Anchor.startOfLine Repeat("a", count: 3) Anchor.endOfLine + Optionally { "\n" } + } + } + + // startOfLine/endOfLine apply regardless of mode. 
+ for matchLineEndings in [true, false] { + for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + let r = Regex { + Anchor.startOfLine + Repeat("a", count: 3) + Anchor.endOfLine + }.anchorsMatchLineEndings(matchLineEndings).matchingSemantics(mode) + + XCTAssertNotNil(try r.firstMatch(in: "\naaa")) + XCTAssertNotNil(try r.firstMatch(in: "aaa\n")) + XCTAssertNotNil(try r.firstMatch(in: "\naaa\n")) + XCTAssertNotNil(try r.firstMatch(in: "\naaa\r\n")) + XCTAssertNotNil(try r.firstMatch(in: "\r\naaa\n")) + XCTAssertNotNil(try r.firstMatch(in: "\r\naaa\r\n")) + + XCTAssertNil(try r.firstMatch(in: "\nbaaa\n")) + XCTAssertNil(try r.firstMatch(in: "\naaab\n")) } } } diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 460cc8e14..e33b10c31 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -85,7 +85,21 @@ extension RenderDSLTests { } """#) } - + + func testAnchor() throws { + try testConversion(#"^(?:a|b|c)$"#, #""" + Regex { + /^/ + ChoiceOf { + "a" + "b" + "c" + } + /$/ + } + """#) + } + func testOptions() throws { try XCTExpectFailure("Options like '(?i)' aren't converted") { try testConversion(#"(?i)abc"#, """ From 05701330a287f7f3d6e854dfdd0c7fc3b8ddb23f Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:42 +0100 Subject: [PATCH 14/28] Add some tests for `CharacterClass.anyGraphemeCluster` --- Tests/RegexBuilderTests/RegexDSLTests.swift | 28 +++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 0fb96bf66..cdaf7b436 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -255,6 +255,34 @@ class RegexDSLTests: XCTestCase { }.dotMatchesNewlines(dotMatchesNewline) } } + + // `.anyGraphemeCluster` is the same as `.any` in grapheme mode. 
+ for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + try _testDSLCaptures( + ("a", "a"), + ("\r\n", "\r\n"), + ("e\u{301}", "e\u{301}"), + ("e\u{301}f", nil), + ("e\u{303}\u{301}\u{302}", "e\u{303}\u{301}\u{302}"), + matchType: Substring.self, ==) + { + Regex { + One(.anyGraphemeCluster) + }.matchingSemantics(mode) + } + + // Like `.any` it also always matches newlines. + for dotMatchesNewline in [true, false] { + try _testDSLCaptures( + ("abc\(allNewlines)def", "abc\(allNewlines)def"), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.anyGraphemeCluster) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + } + } } func testMatchResultDotZeroWithoutCapture() throws { From c7b42f8dbbba8e5fe8ea4d1f178754d85ae3325e Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:42 +0100 Subject: [PATCH 15/28] Add some tests for `CharacterClass.horizontalWhitespace` --- Tests/RegexBuilderTests/RegexDSLTests.swift | 24 +++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index cdaf7b436..212731e71 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -220,6 +220,30 @@ class RegexDSLTests: XCTestCase { }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) } } + + // Make sure horizontal whitespace does not match newlines or other + // vertical whitespace. + try _testDSLCaptures( + (" \u{A0} \u{9} \t ", " \u{A0} \u{9} \t "), + (" \n", nil), + (" \r", nil), + (" \r\n", nil), + (" \u{2028}", nil), + matchType: Substring.self, ==) + { + OneOrMore(.horizontalWhitespace) + } + + // Horizontal whitespace in ASCII mode. 
+ try _testDSLCaptures( + (" \u{9} \t ", " \u{9} \t "), + ("\u{A0}", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.horizontalWhitespace) + }.asciiOnlyWhitespace() + } } func testCharacterClassOperations() throws { From 47888e699fb6fbbce36e54c4f3ea020b9ff2d7e6 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:43 +0100 Subject: [PATCH 16/28] Implement `CharacterClass.anyNonNewline` rdar://97029702 --- Sources/RegexBuilder/CharacterClass.swift | 4 ++ Sources/_StringProcessing/ByteCodeGen.swift | 17 ++++-- .../_StringProcessing/ConsumerInterface.swift | 16 ++++++ .../_StringProcessing/PrintAsPattern.swift | 6 ++ Sources/_StringProcessing/Regex/DSLTree.swift | 8 ++- Tests/RegexBuilderTests/RegexDSLTests.swift | 57 +++++++++++++++++++ 6 files changed, 101 insertions(+), 7 deletions(-) diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index 4e96e510d..ea52c28f3 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -45,6 +45,10 @@ extension RegexComponent where Self == CharacterClass { .init(DSLTree.CustomCharacterClass(members: [.atom(.any)])) } + public static var anyNonNewline: CharacterClass { + .init(DSLTree.CustomCharacterClass(members: [.atom(.anyNonNewline)])) + } + public static var anyGraphemeCluster: CharacterClass { .init(unconverted: ._anyGrapheme) } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 0e0673988..6263186e8 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -58,6 +58,9 @@ fileprivate extension Compiler.ByteCodeGen { case .any: emitAny() + case .anyNonNewline: + emitAnyNonNewline() + case .dot: emitDot() @@ -303,11 +306,7 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitDot() { - if options.dotMatchesNewline { - emitAny() - return - } + mutating func 
emitAnyNonNewline() { switch options.semanticLevel { case .graphemeCluster: builder.buildConsume { input, bounds in @@ -324,6 +323,14 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitDot() { + if options.dotMatchesNewline { + emitAny() + } else { + emitAnyNonNewline() + } + } + mutating func emitAlternation( _ children: [DSLTree.Node] ) throws { diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index ae7149a00..fb9267f4f 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -123,6 +123,22 @@ extension DSLTree.Atom { } } + case .anyNonNewline: + switch opts.semanticLevel { + case .graphemeCluster: + return { input, bounds in + input[bounds.lowerBound].isNewline + ? nil + : input.index(after: bounds.lowerBound) + } + case .unicodeScalar: + return { input, bounds in + input[bounds.lowerBound].isNewline + ? nil + : input.unicodeScalars.index(after: bounds.lowerBound) + } + } + case .dot: throw Unreachable(".atom(.dot) should be handled by emitDot") diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 3e62d1886..21c611d43 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -1109,6 +1109,9 @@ extension DSLTree.Atom { case .any: return (".any", true) + case .anyNonNewline: + return (".anyNonNewline", true) + case .dot: // The DSL does not have an equivalent to '.', print as a regex. return ("/./", false) @@ -1155,6 +1158,9 @@ extension DSLTree.Atom { case .any: return "(?s:.)" + case .anyNonNewline: + return "(?-s:.)" + case .dot: return "." 
diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 549a8b3a1..449baa6a7 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -249,6 +249,10 @@ extension DSLTree { /// Any character, including newlines. case any + /// Any character, excluding newlines. This differs from '.', as it is not + /// affected by single line mode. + case anyNonNewline + /// The DSL representation of '.' in a regex literal. This does not match /// newlines unless single line mode is enabled. case dot @@ -875,8 +879,8 @@ extension DSLTree.Atom { switch self { case .changeMatchingOptions, .assertion: return false - case .char, .scalar, .any, .dot, .backreference, .symbolicReference, - .unconverted: + case .char, .scalar, .any, .anyNonNewline, .dot, .backreference, + .symbolicReference, .unconverted: return true } } diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 212731e71..202c4cdeb 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -309,6 +309,63 @@ class RegexDSLTests: XCTestCase { } } + func testAnyNonNewline() throws { + // `.anyNonNewline` is `.` without single-line mode. 
+ for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + for dotMatchesNewline in [true, false] { + try _testDSLCaptures( + ("abcdef", "abcdef"), + ("abcdef\n", nil), + ("\r\n", nil), + ("\r", nil), + ("\n", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.anyNonNewline) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + + try _testDSLCaptures( + ("abcdef", nil), + ("abcdef\n", nil), + ("\r\n", "\r\n"), + ("\r", "\r"), + ("\n", "\n"), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.anyNonNewline.inverted) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + + try _testDSLCaptures( + ("abc", "abc"), + ("abcd", nil), + ("\r\n", nil), + ("\r", nil), + ("\n", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(CharacterClass.anyNonNewline.intersection(.anyOf("\n\rabc"))) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + } + } + + try _testDSLCaptures( + ("\r\n", "\r\n"), matchType: Substring.self, ==) { + CharacterClass.anyNonNewline.inverted + } + try _testDSLCaptures( + ("\r\n", nil), matchType: Substring.self, ==) { + Regex { + CharacterClass.anyNonNewline.inverted + }.matchingSemantics(.unicodeScalar) + } + } + func testMatchResultDotZeroWithoutCapture() throws { let match = try XCTUnwrap("aaa".wholeMatch { OneOrMore { "a" } }) XCTAssertEqual(match.0, "aaa") From 429b699cfc367475a73ca7fdc334ca33863ebcbe Mon Sep 17 00:00:00 2001 From: Lily Date: Mon, 11 Jul 2022 13:59:21 -0700 Subject: [PATCH 17/28] Break out of quantification loop if there is no forward progress (#560) This fixes infinite loops when we loop over an internal node that does not have any forward progress. 
Also included is an optimization to only emit the check/break instructions if we have a case that might result in an infinite loop (possibly non-progressing inner node + unlimited quantification) --- Sources/_StringProcessing/ByteCodeGen.swift | 65 +++++++++++++++++++ .../Engine/Backtracking.swift | 10 ++- .../Engine/InstPayload.swift | 6 +- .../Engine/Instruction.swift | 18 +++++ .../_StringProcessing/Engine/MEBuilder.swift | 23 ++++++- .../_StringProcessing/Engine/Processor.swift | 16 ++++- .../_StringProcessing/Engine/Registers.swift | 14 ++++ Tests/RegexTests/CompileTests.swift | 36 ++++++++++ Tests/RegexTests/MatchTests.swift | 28 +++++++- 9 files changed, 205 insertions(+), 11 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 6263186e8..015e27c54 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -567,7 +567,12 @@ fileprivate extension Compiler.ByteCodeGen { decrement %minTrips and fallthrough loop-body: + <if can't guarantee forward progress && extraTrips = nil>: + mov currentPosition %pos evaluate the subexpression + <if can't guarantee forward progress && extraTrips = nil>: + if %pos is currentPosition: + goto exit goto min-trip-count control block exit-policy control block: @@ -670,7 +675,28 @@ fileprivate extension Compiler.ByteCodeGen { // // branch min-trip-count builder.label(loopBody) + + // if we aren't sure if the child node will have forward progress and + // we have an unbounded quantification + let startPosition: PositionRegister? + let emitPositionChecking = + (!optimizationsEnabled || !child.guaranteesForwardProgress) && + extraTrips == nil + + if emitPositionChecking { + startPosition = builder.makePositionRegister() + builder.buildMoveCurrentPosition(into: startPosition!)
+ } else { + startPosition = nil + } try emitNode(child) + if emitPositionChecking { + // in all quantifier cases, no matter what minTrips or extraTrips is, + // if we have a successful non-advancing match, branch to exit because it + // can match an arbitrary number of times + builder.buildCondBranch(to: exit, ifSamePositionAs: startPosition!) + } + if minTrips <= 1 { // fallthrough } else { @@ -856,3 +882,42 @@ fileprivate extension Compiler.ByteCodeGen { return nil } } + +extension DSLTree.Node { + var guaranteesForwardProgress: Bool { + switch self { + case .orderedChoice(let children): + return children.allSatisfy { $0.guaranteesForwardProgress } + case .concatenation(let children): + return children.contains(where: { $0.guaranteesForwardProgress }) + case .capture(_, _, let node, _): + return node.guaranteesForwardProgress + case .nonCapturingGroup(let kind, let child): + switch kind.ast { + case .lookahead, .negativeLookahead, .lookbehind, .negativeLookbehind: + return false + default: return child.guaranteesForwardProgress + } + case .atom(let atom): + switch atom { + case .changeMatchingOptions, .assertion: return false + default: return true + } + case .trivia, .empty: + return false + case .quotedLiteral(let string): + return !string.isEmpty + case .convertedRegexLiteral(let node, _): + return node.guaranteesForwardProgress + case .consumer, .matcher: + // Allow zero width consumers and matchers + return false + case .customCharacterClass: + return true + case .quantification(let amount, _, let child): + let (atLeast, _) = amount.ast.bounds + return atLeast ?? 
0 > 0 && child.guaranteesForwardProgress + default: return false + } + } +} diff --git a/Sources/_StringProcessing/Engine/Backtracking.swift b/Sources/_StringProcessing/Engine/Backtracking.swift index 8fcdf9312..355702ac1 100644 --- a/Sources/_StringProcessing/Engine/Backtracking.swift +++ b/Sources/_StringProcessing/Engine/Backtracking.swift @@ -32,15 +32,18 @@ extension Processor { // The int registers store values that can be relevant to // backtracking, such as the number of trips in a quantification. var intRegisters: [Int] + // Same with position registers + var posRegisters: [Input.Index] var destructure: ( pc: InstructionAddress, pos: Position?, stackEnd: CallStackAddress, captureEnds: [_StoredCapture], - intRegisters: [Int] + intRegisters: [Int], + PositionRegister: [Input.Index] ) { - (pc, pos, stackEnd, captureEnds, intRegisters) + (pc, pos, stackEnd, captureEnds, intRegisters, posRegisters) } } @@ -53,7 +56,8 @@ extension Processor { pos: addressOnly ? nil : currentPosition, stackEnd: .init(callStack.count), captureEnds: storedCaptures, - intRegisters: registers.ints) + intRegisters: registers.ints, + posRegisters: registers.positions) } } diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index c614e10fd..21c647a3b 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -284,10 +284,10 @@ extension Instruction.Payload { interpretPair() } - init(pos: PositionRegister, pos2: PositionRegister) { - self.init(pos, pos2) + init(addr: InstructionAddress, position: PositionRegister) { + self.init(addr, position) } - var pairedPosPos: (PositionRegister, PositionRegister) { + var pairedAddrPos: (InstructionAddress, PositionRegister) { interpretPair() } diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 4e715ad9d..4cc810138 100644 --- 
a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -37,6 +37,14 @@ extension Instruction { /// case moveImmediate + /// Move the current position into a register + /// + /// moveCurrentPosition(into: PositionRegister) + /// + /// Operands: + /// - Position register to move into + case moveCurrentPosition + // MARK: General Purpose: Control flow /// Branch to a new instruction @@ -57,6 +65,16 @@ extension Instruction { /// case condBranchZeroElseDecrement + /// Conditionally branch if the current position is the same as the register + /// + /// condBranch( + /// to: InstAddr, ifSamePositionAs: PositionRegister) + /// + /// Operands: + /// - Instruction address to branch to, if the position in the register is the same as currentPosition + /// - Position register to check against + case condBranchSamePosition + // TODO: Function calls // MARK: - Matching diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 676b21473..84b80489f 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -32,6 +32,7 @@ extension MEProgram { var nextIntRegister = IntRegister(0) var nextCaptureRegister = CaptureRegister(0) var nextValueRegister = ValueRegister(0) + var nextPositionRegister = PositionRegister(0) // Special addresses or instructions var failAddressToken: AddressToken? 
= nil @@ -105,6 +106,14 @@ extension MEProgram.Builder { fixup(to: t) } + mutating func buildCondBranch( + to t: AddressToken, + ifSamePositionAs r: PositionRegister + ) { + instructions.append(.init(.condBranchSamePosition, .init(position: r))) + fixup(to: t) + } + mutating func buildSave(_ t: AddressToken) { instructions.append(.init(.save)) fixup(to: t) @@ -211,6 +220,10 @@ extension MEProgram.Builder { .init(value: value, capture: capture))) } + mutating func buildMoveCurrentPosition(into r: PositionRegister) { + instructions.append(.init(.moveCurrentPosition, .init(position: r))) + } + mutating func buildBackreference( _ cap: CaptureRegister ) { @@ -257,7 +270,8 @@ extension MEProgram.Builder { switch inst.opcode { case .condBranchZeroElseDecrement: payload = .init(addr: addr, int: inst.payload.int) - + case .condBranchSamePosition: + payload = .init(addr: addr, position: inst.payload.position) case .branch, .save, .saveAddress, .clearThrough: payload = .init(addr: addr) @@ -281,6 +295,7 @@ extension MEProgram.Builder { regInfo.sequences = sequences.count regInfo.ints = nextIntRegister.rawValue regInfo.values = nextValueRegister.rawValue + regInfo.positions = nextPositionRegister.rawValue regInfo.bitsets = asciiBitsets.count regInfo.consumeFunctions = consumeFunctions.count regInfo.assertionFunctions = assertionFunctions.count @@ -421,6 +436,12 @@ extension MEProgram.Builder { return r } + mutating func makePositionRegister() -> PositionRegister { + let r = nextPositionRegister + defer { nextPositionRegister.rawValue += 1 } + return r + } + // TODO: A register-mapping helper struct, which could release // registers without monotonicity required diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index f7b3a65a2..d19da01e5 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -245,7 +245,7 @@ extension Processor { } mutating func 
signalFailure() { - guard let (pc, pos, stackEnd, capEnds, intRegisters) = + guard let (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoints.popLast()?.destructure else { state = .fail @@ -259,6 +259,7 @@ extension Processor { callStack.removeLast(callStack.count - stackEnd.rawValue) storedCaptures = capEnds registers.ints = intRegisters + registers.positions = posRegisters } mutating func abort(_ e: Error? = nil) { @@ -315,7 +316,10 @@ extension Processor { registers[reg] = int controller.step() - + case .moveCurrentPosition: + let reg = payload.position + registers[reg] = currentPosition + controller.step() case .branch: controller.pc = payload.addr @@ -327,7 +331,13 @@ extension Processor { registers[int] -= 1 controller.step() } - + case .condBranchSamePosition: + let (addr, pos) = payload.pairedAddrPos + if registers[pos] == currentPosition { + controller.pc = addr + } else { + controller.step() + } case .save: let resumeAddr = payload.addr let sp = makeSavePoint(resumeAddr) diff --git a/Sources/_StringProcessing/Engine/Registers.swift b/Sources/_StringProcessing/Engine/Registers.swift index c76413383..e5d33af8b 100644 --- a/Sources/_StringProcessing/Engine/Registers.swift +++ b/Sources/_StringProcessing/Engine/Registers.swift @@ -47,6 +47,8 @@ extension Processor { var ints: [Int] var values: [Any] + + var positions: [Input.Index] } } @@ -66,6 +68,12 @@ extension Processor.Registers { values[i.rawValue] = newValue } } + subscript(_ i: PositionRegister) -> Input.Index { + get { positions[i.rawValue] } + set { + positions[i.rawValue] = newValue + } + } subscript(_ i: ElementRegister) -> Input.Element { elements[i.rawValue] } @@ -89,6 +97,8 @@ extension Processor.Registers { } extension Processor.Registers { + static let sentinelIndex = "".startIndex + init( _ program: MEProgram, _ sentinel: String.Index @@ -120,11 +130,15 @@ extension Processor.Registers { self.values = Array( repeating: SentinelValue(), count: info.values) + self.positions = 
Array( + repeating: Processor.Registers.sentinelIndex, + count: info.positions) } mutating func reset(sentinel: Input.Index) { self.ints._setAll(to: 0) self.values._setAll(to: SentinelValue()) + self.positions._setAll(to: Processor.Registers.sentinelIndex) } } diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 4e64f7335..712808184 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -208,4 +208,40 @@ extension RegexTests { expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, doesNotContain: [.matchBitset]) expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, contains: [.consumeBy]) } + + func testQuantificationForwardProgressCompile() { + // Unbounded quantification + non forward progressing inner nodes + // Expect to emit the position checking instructions + expectProgram(for: #"(?:(?=a)){1,}"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\b)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:(?#comment))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:|)+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|)+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?i-i:))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?#comment))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?#comment)(?i-i:))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?i))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + + // Bounded quantification, don't emit position checking + expectProgram(for: #"(?:(?=a)){1,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\b)?"#, doesNotContain: [.moveCurrentPosition, 
.condBranchSamePosition]) + expectProgram(for: #"(?:(?#comment)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:|){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?i-i:)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?#comment)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?#comment)(?i-i:)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?i)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + + // Inner node is a quantification that does not guarantee forward progress + expectProgram(for: #"(a*)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(a?)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(a{,5})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"((\b){,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"((\b){1,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"((|){1,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + // Inner node is a quantification that guarantees forward progress + expectProgram(for: #"(a+)*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(a{1,})*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 8f7baf4b9..e2587368d 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1943,5 +1943,31 @@ extension RegexTests { XCTAssertEqual(matches.count, 3) } } -} + 
func expectCompletion(regex: String, in target: String) { + let expectation = XCTestExpectation(description: "Run the given regex to completion") + Task.init { + let r = try! Regex(regex) + let val = target.matches(of: r).isEmpty + expectation.fulfill() + return val + } + wait(for: [expectation], timeout: 3.0) + } + + func testQuantificationForwardProgress() { + expectCompletion(regex: #"(?:(?=a)){1,}"#, in: "aa") + expectCompletion(regex: #"(?:\b)+"#, in: "aa") + expectCompletion(regex: #"(?:(?#comment))+"#, in: "aa") + expectCompletion(regex: #"(?:|)+"#, in: "aa") + expectCompletion(regex: #"(?:\w|)+"#, in: "aa") + expectCompletion(regex: #"(?:\w|(?i-i:))+"#, in: "aa") + expectCompletion(regex: #"(?:\w|(?#comment))+"#, in: "aa") + expectCompletion(regex: #"(?:\w|(?#comment)(?i-i:))+"#, in: "aa") + expectCompletion(regex: #"(?:\w|(?i))+"#, in: "aa") + expectCompletion(regex: #"(a*)*"#, in: "aa") + expectCompletion(regex: #"(a?)*"#, in: "aa") + expectCompletion(regex: #"(a{,4})*"#, in: "aa") + expectCompletion(regex: #"((|)+)*"#, in: "aa") + } +} From 92a051aa4ebe38336afa27daebac3a5cba44a329 Mon Sep 17 00:00:00 2001 From: Lily Date: Tue, 12 Jul 2022 11:39:30 -0700 Subject: [PATCH 18/28] Optimize matching to match on scalar values when possible (#525) - Adds new instructions for matching characters and scalars case insensitively - Compiles ascii character matches into the faster scalar match instructions even in grapheme semantic mode - Optimizes out unnecessary runtime grapheme boundary checks for all ascii strings - Also includes fixes to scalar matching in grapheme semantic mode (#565) --- Sources/_StringProcessing/ByteCodeGen.swift | 135 ++++----- .../_StringProcessing/ConsumerInterface.swift | 103 ++++--- .../Engine/InstPayload.swift | 38 ++- .../Engine/Instruction.swift | 25 +- .../_StringProcessing/Engine/MEBuilder.swift | 26 +- .../_StringProcessing/Engine/Processor.swift | 100 ++++++- .../_StringProcessing/PrintAsPattern.swift | 56 ++-- 
.../Regex/ASTConversion.swift | 2 +- Sources/_StringProcessing/Regex/DSLTree.swift | 80 ------ .../Utility/AsciiBitset.swift | 99 +++++++ Tests/RegexBuilderTests/RegexDSLTests.swift | 64 +++++ Tests/RegexTests/CompileTests.swift | 259 +++++++++++++++--- Tests/RegexTests/MatchTests.swift | 47 +++- Tests/RegexTests/RenderDSLTests.swift | 30 ++ 14 files changed, 777 insertions(+), 287 deletions(-) create mode 100644 Sources/_StringProcessing/Utility/AsciiBitset.swift diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 015e27c54..477760ef8 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -65,10 +65,14 @@ fileprivate extension Compiler.ByteCodeGen { emitDot() case let .char(c): - try emitCharacter(c) + emitCharacter(c) case let .scalar(s): - try emitScalar(s) + if options.semanticLevel == .graphemeCluster { + emitCharacter(Character(s)) + } else { + emitMatchScalar(s) + } case let .assertion(kind): try emitAssertion(kind) @@ -94,6 +98,34 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitQuotedLiteral(_ s: String) { + guard options.semanticLevel == .graphemeCluster else { + for char in s { + for scalar in char.unicodeScalars { + emitMatchScalar(scalar) + } + } + return + } + + // Fast path for eliding boundary checks for an all ascii quoted literal + if optimizationsEnabled && s.allSatisfy(\.isASCII) { + let lastIdx = s.unicodeScalars.indices.last! 
+ for idx in s.unicodeScalars.indices { + let boundaryCheck = idx == lastIdx + let scalar = s.unicodeScalars[idx] + if options.isCaseInsensitive && scalar.properties.isCased { + builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck) + } else { + builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck) + } + } + return + } + + for c in s { emitCharacter(c) } + } + mutating func emitBackreference( _ ref: AST.Reference ) throws { @@ -257,41 +289,47 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitScalar(_ s: UnicodeScalar) throws { - // TODO: Native instruction buildMatchScalar(s) - if options.isCaseInsensitive { - // TODO: e.g. buildCaseInsensitiveMatchScalar(s) - builder.buildConsume(by: consumeScalar { - $0.properties.lowercaseMapping == s.properties.lowercaseMapping - }) + mutating func emitMatchScalar(_ s: UnicodeScalar) { + assert(options.semanticLevel == .unicodeScalar) + if options.isCaseInsensitive && s.properties.isCased { + builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false) } else { - builder.buildConsume(by: consumeScalar { - $0 == s - }) + builder.buildMatchScalar(s, boundaryCheck: false) } } - mutating func emitCharacter(_ c: Character) throws { - // Unicode scalar matches the specific scalars that comprise a character + mutating func emitCharacter(_ c: Character) { + // Unicode scalar mode matches the specific scalars that comprise a character if options.semanticLevel == .unicodeScalar { for scalar in c.unicodeScalars { - try emitScalar(scalar) + emitMatchScalar(scalar) } return } if options.isCaseInsensitive && c.isCased { - // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true) - builder.buildConsume { input, bounds in - let inputChar = input[bounds.lowerBound].lowercased() - let matchChar = c.lowercased() - return inputChar == matchChar - ? 
input.index(after: bounds.lowerBound) - : nil + if optimizationsEnabled && c.isASCII { + // c.isCased ensures that c is not CR-LF, + // so we know that c is a single scalar + assert(c.unicodeScalars.count == 1) + builder.buildMatchScalarCaseInsensitive( + c.unicodeScalars.last!, + boundaryCheck: true) + } else { + builder.buildMatch(c, isCaseInsensitive: true) } - } else { - builder.buildMatch(c) + return } + + if optimizationsEnabled && c.isASCII { + let lastIdx = c.unicodeScalars.indices.last! + for idx in c.unicodeScalars.indices { + builder.buildMatchScalar(c.unicodeScalars[idx], boundaryCheck: idx == lastIdx) + } + return + } + + builder.buildMatch(c, isCaseInsensitive: false) } mutating func emitAny() { @@ -741,11 +779,12 @@ fileprivate extension Compiler.ByteCodeGen { _ ccc: DSLTree.CustomCharacterClass ) throws { if let asciiBitset = ccc.asAsciiBitset(options), - options.semanticLevel == .graphemeCluster, optimizationsEnabled { - // future work: add a bit to .matchBitset to consume either a character - // or a scalar so we can have this optimization in scalar mode - builder.buildMatchAsciiBitset(asciiBitset) + if options.semanticLevel == .unicodeScalar { + builder.buildScalarMatchAsciiBitset(asciiBitset) + } else { + builder.buildMatchAsciiBitset(asciiBitset) + } } else { let consumer = try ccc.generateConsumer(options) builder.buildConsume(by: consumer) @@ -822,45 +861,7 @@ fileprivate extension Compiler.ByteCodeGen { try emitAtom(a) case let .quotedLiteral(s): - if options.semanticLevel == .graphemeCluster { - if options.isCaseInsensitive { - // TODO: buildCaseInsensitiveMatchSequence(c) or alternative - builder.buildConsume { input, bounds in - var iterator = s.makeIterator() - var currentIndex = bounds.lowerBound - while let ch = iterator.next() { - guard currentIndex < bounds.upperBound, - ch.lowercased() == input[currentIndex].lowercased() - else { return nil } - input.formIndex(after: &currentIndex) - } - return currentIndex - } - } else { - 
builder.buildMatchSequence(s) - } - } else { - builder.buildConsume { - [caseInsensitive = options.isCaseInsensitive] input, bounds in - // TODO: Case folding - var iterator = s.unicodeScalars.makeIterator() - var currentIndex = bounds.lowerBound - while let scalar = iterator.next() { - guard currentIndex < bounds.upperBound else { return nil } - if caseInsensitive { - if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping { - return nil - } - } else { - if scalar != input.unicodeScalars[currentIndex] { - return nil - } - } - input.unicodeScalars.formIndex(after: &currentIndex) - } - return currentIndex - } - } + emitQuotedLiteral(s) case let .convertedRegexLiteral(n, _): return try emitNode(n) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index fb9267f4f..668d16eb6 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -11,6 +11,13 @@ @_implementationOnly import _RegexParser +extension Character { + var _singleScalarAsciiValue: UInt8? { + guard self != "\r\n" else { return nil } + return asciiValue + } +} + extension DSLTree.Node { /// Attempt to generate a consumer from this AST node /// @@ -53,11 +60,50 @@ extension DSLTree._AST.Atom { } } +extension Character { + func generateConsumer( + _ opts: MatchingOptions + ) throws -> MEProgram.ConsumeFunction? { + let isCaseInsensitive = opts.isCaseInsensitive + switch opts.semanticLevel { + case .graphemeCluster: + return { input, bounds in + let low = bounds.lowerBound + if isCaseInsensitive && isCased { + return input[low].lowercased() == lowercased() + ? input.index(after: low) + : nil + } else { + return input[low] == self + ? input.index(after: low) + : nil + } + } + case .unicodeScalar: + // TODO: This should only be reachable from character class emission, can + // we guarantee that? Otherwise we'd want a different matching behavior. 
+ let consumers = unicodeScalars.map { s in consumeScalar { + isCaseInsensitive + ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping + : $0 == s + }} + return { input, bounds in + for fn in consumers { + if let idx = fn(input, bounds) { + return idx + } + } + return nil + } + } + } +} + extension DSLTree.Atom { var singleScalarASCIIValue: UInt8? { switch self { - case let .char(c) where c != "\r\n": - return c.asciiValue + case let .char(c): + return c._singleScalarAsciiValue case let .scalar(s) where s.isASCII: return UInt8(ascii: s) case let .unconverted(atom): @@ -72,44 +118,15 @@ extension DSLTree.Atom { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { - let isCaseInsensitive = opts.isCaseInsensitive - switch self { case let .char(c): - if opts.semanticLevel == .graphemeCluster { - return { input, bounds in - let low = bounds.lowerBound - if isCaseInsensitive && c.isCased { - return input[low].lowercased() == c.lowercased() - ? input.index(after: low) - : nil - } else { - return input[low] == c - ? input.index(after: low) - : nil - } - } - } else { - let consumers = c.unicodeScalars.map { s in consumeScalar { - isCaseInsensitive - ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping - : $0 == s - }} - return { input, bounds in - for fn in consumers { - if let idx = fn(input, bounds) { - return idx - } - } - return nil - } - } + return try c.generateConsumer(opts) + case let .scalar(s): - return consumeScalar { - isCaseInsensitive - ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping - : $0 == s - } + // A scalar always matches the same as a single scalar character. This + // means it must match a whole grapheme in grapheme semantic mode, but + // can match a single scalar in scalar semantic mode. + return try Character(s).generateConsumer(opts) case .any: // FIXME: Should this be a total ordering? @@ -230,16 +247,20 @@ extension AST.Atom { var singleScalar: UnicodeScalar? 
{ switch kind { case .scalar(let s): return s.value + case .escaped(let e): + guard let s = e.scalarValue else { return nil } + return s default: return nil } } var singleScalarASCIIValue: UInt8? { + if let s = singleScalar, s.isASCII { + return UInt8(ascii: s) + } switch kind { - case let .char(c) where c != "\r\n": - return c.asciiValue - case let .scalar(s) where s.value.isASCII: - return UInt8(ascii: s.value) + case let .char(c): + return c._singleScalarAsciiValue default: return nil } diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index 21c647a3b..42fb86913 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -147,6 +147,26 @@ extension Instruction.Payload { var string: StringRegister { interpret() } + + init(scalar: Unicode.Scalar) { + self.init(UInt64(scalar.value)) + } + var scalar: Unicode.Scalar { + return Unicode.Scalar(_value: UInt32(self.rawValue)) + } + + init(scalar: Unicode.Scalar, caseInsensitive: Bool, boundaryCheck: Bool) { + let raw = UInt64(scalar.value) + + (caseInsensitive ? 1 << 55: 0) + + (boundaryCheck ? 1 << 54 : 0) + self.init(raw) + } + var scalarPayload: (Unicode.Scalar, caseInsensitive: Bool, boundaryCheck: Bool) { + let caseInsensitive = (self.rawValue >> 55) & 1 == 1 + let boundaryCheck = (self.rawValue >> 54) & 1 == 1 + let scalar = Unicode.Scalar(_value: UInt32(self.rawValue & 0xFFFF_FFFF)) + return (scalar, caseInsensitive: caseInsensitive, boundaryCheck: boundaryCheck) + } init(sequence: SequenceRegister) { self.init(sequence) @@ -190,18 +210,20 @@ extension Instruction.Payload { interpret() } - init(element: ElementRegister) { - self.init(element) + init(element: ElementRegister, isCaseInsensitive: Bool) { + self.init(isCaseInsensitive ? 
1 : 0, element) } - var element: ElementRegister { - interpret() + var elementPayload: (isCaseInsensitive: Bool, ElementRegister) { + let pair: (UInt64, ElementRegister) = interpretPair() + return (isCaseInsensitive: pair.0 == 1, pair.1) } - init(bitset: AsciiBitsetRegister) { - self.init(bitset) + init(bitset: AsciiBitsetRegister, isScalar: Bool) { + self.init(isScalar ? 1 : 0, bitset) } - var bitset: AsciiBitsetRegister { - interpret() + var bitsetPayload: (isScalar: Bool, AsciiBitsetRegister) { + let pair: (UInt64, AsciiBitsetRegister) = interpretPair() + return (isScalar: pair.0 == 1, pair.1) } init(consumer: ConsumeFunctionRegister) { diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 4cc810138..8e1a1f294 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -90,20 +90,27 @@ extension Instruction { /// Composite assert-advance else restore. /// - /// match(_: EltReg) + /// match(_: EltReg, isCaseInsensitive: Bool) /// - /// Operand: Element register to compare against. + /// Operands: + /// - Element register to compare against. + /// - Boolean for if we should match in a case insensitive way case match - /// Match against a sequence of elements + /// Match against a scalar and possibly perform a boundary check or match in a case insensitive way /// - /// matchSequence(_: SeqReg) + /// matchScalar(_: Unicode.Scalar, isCaseInsensitive: Bool, boundaryCheck: Bool) /// - /// Operand: Sequence register to compare against. 
- case matchSequence + /// Operands: Scalar value to match against and booleans + case matchScalar - /// Match against a set of valid ascii values stored in a bitset - /// Operand: Ascii bitset register containing the bitset + /// Match a character or a scalar against a set of valid ascii values stored in a bitset + /// + /// matchBitset(_: AsciiBitsetRegister, isScalar: Bool) + /// + /// Operand: + /// - Ascii bitset register containing the bitset + /// - Boolean for if we should match by scalar value case matchBitset /// TODO: builtin assertions and anchors @@ -324,7 +331,7 @@ extension Instruction { var elementRegister: ElementRegister? { switch opcode { case .match: - return payload.element + return payload.elementPayload.1 default: return nil } } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 84b80489f..0b9a91726 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -144,24 +144,32 @@ extension MEProgram.Builder { instructions.append(.init(.advance, .init(distance: n))) } - mutating func buildMatch(_ e: Character) { + mutating func buildMatch(_ e: Character, isCaseInsensitive: Bool) { instructions.append(.init( - .match, .init(element: elements.store(e)))) + .match, .init(element: elements.store(e), isCaseInsensitive: isCaseInsensitive))) } - mutating func buildMatchSequence( - _ s: S - ) where S.Element == Character { - instructions.append(.init( - .matchSequence, - .init(sequence: sequences.store(.init(s))))) + mutating func buildMatchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) { + instructions.append(.init(.matchScalar, .init(scalar: s, caseInsensitive: false, boundaryCheck: boundaryCheck))) + } + + mutating func buildMatchScalarCaseInsensitive(_ s: Unicode.Scalar, boundaryCheck: Bool) { + instructions.append(.init(.matchScalar, .init(scalar: s, caseInsensitive: true, boundaryCheck: boundaryCheck))) } + mutating func 
buildMatchAsciiBitset( _ b: DSLTree.CustomCharacterClass.AsciiBitset ) { instructions.append(.init( - .matchBitset, .init(bitset: makeAsciiBitset(b)))) + .matchBitset, .init(bitset: makeAsciiBitset(b), isScalar: false))) + } + + mutating func buildScalarMatchAsciiBitset( + _ b: DSLTree.CustomCharacterClass.AsciiBitset + ) { + instructions.append(.init( + .matchBitset, .init(bitset: makeAsciiBitset(b), isScalar: true))) } mutating func buildConsume( diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index d19da01e5..2be918294 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -219,6 +219,15 @@ extension Processor { return true } + mutating func matchCaseInsensitive(_ e: Element) -> Bool { + guard let cur = load(), cur.lowercased() == e.lowercased() else { + signalFailure() + return false + } + _uncheckedForcedConsumeOne() + return true + } + // Match against the current input prefix. Returns whether // it succeeded vs signaling an error. mutating func matchSeq( @@ -230,6 +239,44 @@ extension Processor { return true } + func loadScalar() -> Unicode.Scalar? { + currentPosition < end ? 
input.unicodeScalars[currentPosition] : nil + } + + mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool { + guard s == loadScalar(), + let idx = input.unicodeScalars.index( + currentPosition, + offsetBy: 1, + limitedBy: end), + (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) + else { + signalFailure() + return false + } + currentPosition = idx + return true + } + + mutating func matchScalarCaseInsensitive( + _ s: Unicode.Scalar, + boundaryCheck: Bool + ) -> Bool { + guard let curScalar = loadScalar(), + s.properties.lowercaseMapping == curScalar.properties.lowercaseMapping, + let idx = input.unicodeScalars.index( + currentPosition, + offsetBy: 1, + limitedBy: end), + (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) + else { + signalFailure() + return false + } + currentPosition = idx + return true + } + // If we have a bitset we know that the CharacterClass only matches against // ascii characters, so check if the current input element is ascii then // check if it is set in the bitset @@ -244,6 +291,20 @@ extension Processor { return true } + // Equivalent of matchBitset but emitted when in unicode scalar semantic mode + mutating func matchBitsetScalar( + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + ) -> Bool { + guard let curScalar = loadScalar(), + bitset.matches(scalar: curScalar), + let idx = input.unicodeScalars.index(currentPosition, offsetBy: 1, limitedBy: end) else { + signalFailure() + return false + } + currentPosition = idx + return true + } + mutating func signalFailure() { guard let (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoints.popLast()?.destructure @@ -379,23 +440,40 @@ extension Processor { } case .match: - let reg = payload.element - if match(registers[reg]) { - controller.step() + let (isCaseInsensitive, reg) = payload.elementPayload + if isCaseInsensitive { + if matchCaseInsensitive(registers[reg]) { + controller.step() + } + } else { + if match(registers[reg]) { + 
controller.step() + } } - case .matchSequence: - let reg = payload.sequence - let seq = registers[reg] - if matchSeq(seq) { - controller.step() + case .matchScalar: + let (scalar, caseInsensitive, boundaryCheck) = payload.scalarPayload + if caseInsensitive { + if matchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck) { + controller.step() + } + } else { + if matchScalar(scalar, boundaryCheck: boundaryCheck) { + controller.step() + } } case .matchBitset: - let reg = payload.bitset + let (isScalar, reg) = payload.bitsetPayload let bitset = registers[reg] - if matchBitset(bitset) { - controller.step() + if isScalar { + if matchBitsetScalar(bitset) { + controller.step() + } + } else { + if matchBitset(bitset) { + controller.step() + } } case .consumeBy: diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 21c611d43..80f2e7697 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -315,8 +315,7 @@ extension PrettyPrinter { return } - var charMembers = "" - + var charMembers = StringLiteralBuilder() // This iterates through all of the character class members collecting all // of the members who can be stuffed into a singular '.anyOf(...)' vs. @@ -340,14 +339,10 @@ extension PrettyPrinter { switch a { case let .char(c): charMembers.append(c) - - if c == "\\" { - charMembers.append(c) - } - return false case let .scalar(s): - charMembers += "\\u{\(String(s.value, radix: 16, uppercase: true))}" + charMembers.append( + unescaped: "\\u{\(String(s.value, radix: 16, uppercase: true))}") return false case .unconverted(_): return true @@ -356,7 +351,7 @@ extension PrettyPrinter { } case let .quotedLiteral(s): - charMembers += s + charMembers.append(s) return false case .trivia(_): @@ -370,7 +365,7 @@ extension PrettyPrinter { // Also in the same vein, if we have a few atom members but no // nonAtomMembers, then we can emit a single .anyOf(...) 
for them. if !charMembers.isEmpty, nonCharMembers.isEmpty { - let anyOf = ".anyOf(\(charMembers._quoted))" + let anyOf = ".anyOf(\(charMembers))" indent() @@ -393,7 +388,7 @@ extension PrettyPrinter { printer.indent() if !charMembers.isEmpty { - printer.output(".anyOf(\(charMembers._quoted))") + printer.output(".anyOf(\(charMembers))") if nonCharMembers.count > 0 { printer.output(",") @@ -617,10 +612,39 @@ extension PrettyPrinter { } extension String { - // TODO: Escaping? + fileprivate var _escaped: String { + _replacing(#"\"#, with: #"\\"#)._replacing(#"""#, with: #"\""#) + } + fileprivate var _quoted: String { - "\"\(self._replacing(#"\"#, with: #"\\"#)._replacing(#"""#, with: #"\""#))\"" + _escaped._bareQuoted + } + + fileprivate var _bareQuoted: String { + #""\#(self)""# + } +} + +/// A helper for building string literals, which handles escaping the contents +/// appended. +fileprivate struct StringLiteralBuilder { + private var contents = "" + + var result: String { contents._bareQuoted } + var isEmpty: Bool { contents.isEmpty } + + mutating func append(_ str: String) { + contents += str._escaped + } + mutating func append(_ c: Character) { + contents += String(c)._escaped } + mutating func append(unescaped str: String) { + contents += str + } +} +extension StringLiteralBuilder: CustomStringConvertible { + var description: String { result } } extension DSLTree.Atom.Assertion { @@ -1121,8 +1145,8 @@ extension DSLTree.Atom { case let .scalar(s): let hex = String(s.value, radix: 16, uppercase: true) - return ("\\u{\(hex)}"._quoted, false) - + return ("\\u{\(hex)}"._bareQuoted, false) + case let .unconverted(a): if a.ast.isUnprintableAtom { return ("#/\(a.ast._regexBase)/#", false) @@ -1169,7 +1193,7 @@ extension DSLTree.Atom { case let .scalar(s): let hex = String(s.value, radix: 16, uppercase: true) - return "\\u{\(hex)}"._quoted + return "\\u{\(hex)}"._bareQuoted case let .unconverted(a): return a.ast._regexBase diff --git 
a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 2146fd61b..c4ac8e759 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -244,7 +244,7 @@ extension AST.Atom { switch self.kind { case let .char(c): return .char(c) - case let .scalar(s): return .char(Character(s.value)) + case let .scalar(s): return .scalar(s.value) case .dot: return .dot case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 449baa6a7..4ea905fd5 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -159,86 +159,6 @@ extension DSLTree { indirect case subtraction(CustomCharacterClass, CustomCharacterClass) indirect case symmetricDifference(CustomCharacterClass, CustomCharacterClass) } - - internal struct AsciiBitset { - let isInverted: Bool - var a: UInt64 = 0 - var b: UInt64 = 0 - - init(isInverted: Bool) { - self.isInverted = isInverted - } - - init(_ val: UInt8, _ isInverted: Bool, _ isCaseInsensitive: Bool) { - self.isInverted = isInverted - add(val, isCaseInsensitive) - } - - init(low: UInt8, high: UInt8, isInverted: Bool, isCaseInsensitive: Bool) { - self.isInverted = isInverted - for val in low...high { - add(val, isCaseInsensitive) - } - } - - internal init( - a: UInt64, - b: UInt64, - isInverted: Bool - ) { - self.isInverted = isInverted - self.a = a - self.b = b - } - - internal mutating func add(_ val: UInt8, _ isCaseInsensitive: Bool) { - setBit(val) - if isCaseInsensitive { - switch val { - case 64...90: setBit(val + 32) - case 97...122: setBit(val - 32) - default: break - } - } - } - - internal mutating func setBit(_ val: UInt8) { - if val < 64 { - a = a | 1 << val - } else { - b = b | 1 
<< (val - 64) - } - } - - internal func matches(char: Character) -> Bool { - let ret: Bool - if let val = char.asciiValue { - if val < 64 { - ret = (a >> val) & 1 == 1 - } else { - ret = (b >> (val - 64)) & 1 == 1 - } - } else { - ret = false - } - - if isInverted { - return !ret - } - - return ret - } - - /// Joins another bitset from a Member of the same CustomCharacterClass - internal func union(_ other: AsciiBitset) -> AsciiBitset { - precondition(self.isInverted == other.isInverted) - return AsciiBitset( - a: self.a | other.a, - b: self.b | other.b, - isInverted: self.isInverted - ) - } - } } @_spi(RegexBuilder) diff --git a/Sources/_StringProcessing/Utility/AsciiBitset.swift b/Sources/_StringProcessing/Utility/AsciiBitset.swift new file mode 100644 index 000000000..ad3159820 --- /dev/null +++ b/Sources/_StringProcessing/Utility/AsciiBitset.swift @@ -0,0 +1,99 @@ +extension DSLTree.CustomCharacterClass { + internal struct AsciiBitset { + let isInverted: Bool + var a: UInt64 = 0 + var b: UInt64 = 0 + + init(isInverted: Bool) { + self.isInverted = isInverted + } + + init(_ val: UInt8, _ isInverted: Bool, _ isCaseInsensitive: Bool) { + self.isInverted = isInverted + add(val, isCaseInsensitive) + } + + init(low: UInt8, high: UInt8, isInverted: Bool, isCaseInsensitive: Bool) { + self.isInverted = isInverted + for val in low...high { + add(val, isCaseInsensitive) + } + } + + internal init( + a: UInt64, + b: UInt64, + isInverted: Bool + ) { + self.isInverted = isInverted + self.a = a + self.b = b + } + + internal mutating func add(_ val: UInt8, _ isCaseInsensitive: Bool) { + setBit(val) + if isCaseInsensitive { + switch val { + case 64...90: setBit(val + 32) + case 97...122: setBit(val - 32) + default: break + } + } + } + + internal mutating func setBit(_ val: UInt8) { + if val < 64 { + a = a | 1 << val + } else { + b = b | 1 << (val - 64) + } + } + + private func matches(_ val: UInt8) -> Bool { + if val < 64 { + return (a >> val) & 1 == 1 + } else { + return (b >> 
(val - 64)) & 1 == 1 + } + } + + internal func matches(char: Character) -> Bool { + let matched: Bool + if let val = char._singleScalarAsciiValue { + matched = matches(val) + } else { + matched = false + } + + if isInverted { + return !matched + } + return matched + } + + internal func matches(scalar: Unicode.Scalar) -> Bool { + let matched: Bool + if scalar.isASCII { + let val = UInt8(ascii: scalar) + matched = matches(val) + } else { + matched = false + } + + if isInverted { + return !matched + } + return matched + } + + /// Joins another bitset from a Member of the same CustomCharacterClass + internal func union(_ other: AsciiBitset) -> AsciiBitset { + precondition(self.isInverted == other.isInverted) + return AsciiBitset( + a: self.a | other.a, + b: self.b | other.b, + isInverted: self.isInverted + ) + } + } +} diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 202c4cdeb..05375a1f7 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -13,6 +13,10 @@ import XCTest import _StringProcessing import RegexBuilder +#if os(Linux) +func XCTExpectFailure(_ message: String? = nil, body: () throws -> Void) rethrows {} +#endif + class RegexDSLTests: XCTestCase { func _testDSLCaptures( _ tests: (input: String, expectedCaptures: MatchType?)..., @@ -1391,6 +1395,66 @@ class RegexDSLTests: XCTestCase { } } + func testScalarMatching() throws { + // RegexBuilder provides a RegexComponent conformance for UnicodeScalar. In + // grapheme cluster mode, it should only match entire graphemes. It may + // match a single scalar of a grapheme cluster in scalar semantic mode. 
+ XCTAssertNotNil("a".firstMatch(of: "a" as UnicodeScalar)) + XCTAssertNil("a\u{301}".firstMatch(of: "a" as UnicodeScalar)) + XCTAssertNotNil("a\u{301}".firstMatch( + of: ("a" as UnicodeScalar).regex.matchingSemantics(.unicodeScalar))) + + let r1 = Regex { + "a" as UnicodeScalar + } + XCTAssertNil(try r1.firstMatch(in: "a\u{301}")) + XCTAssertNotNil( + try r1.matchingSemantics(.unicodeScalar).firstMatch(in: "a\u{301}") + ) + + let r2 = Regex { + CharacterClass.anyOf(["a" as UnicodeScalar, "👍"]) + } + XCTAssertNil(try r2.firstMatch(in: "a\u{301}")) + XCTAssertNotNil( + try r2.matchingSemantics(.unicodeScalar).firstMatch(in: "a\u{301}") + ) + + let r3 = Regex { + "👨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👧" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👦" as UnicodeScalar + } + XCTAssertNil(try r3.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦")) + + let r4 = Regex { "é" as UnicodeScalar } + XCTAssertNotNil( + try r4.firstMatch(in: "e\u{301}") + ) + XCTAssertNotNil( + try r4.firstMatch(in: "é") + ) + + try XCTExpectFailure("Need stronger scalar coalescing logic") { + let r5 = Regex { + "e" + "\u{301}" as UnicodeScalar + } + XCTAssertNotNil( + try r5.firstMatch(in: "e\u{301}") + ) + XCTAssertNotNil( + try r5.firstMatch(in: "é") + ) + } + } + struct SemanticVersion: Equatable { var major: Int var minor: Int diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 712808184..6c8f66e10 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -14,6 +14,131 @@ import XCTest +enum DecodedInstr { + case invalid + case moveImmediate + case moveCurrentPosition + case branch + case condBranchZeroElseDecrement + case condBranchSamePosition + case save + case saveAddress + case splitSaving + 
case clear + case clearThrough + case accept + case fail + case advance + case match + case matchCaseInsensitive + case matchScalar + case matchScalarCaseInsensitiveUnchecked + case matchScalarCaseInsensitive + case matchScalarUnchecked + case matchBitsetScalar + case matchBitset + case consumeBy + case assertBy + case matchBy + case backreference + case beginCapture + case endCapture + case transformCapture + case captureValue + case builtinAssertion + case builtinCharacterClass +} + +extension DecodedInstr { + /// Decode the given instruction by looking at the opcode and payload, expanding out certain instructions + /// like matchScalar and match into their variants + /// + /// Must stay in sync with Processor.cycle + static func decode(_ instruction: Instruction) -> DecodedInstr { + let (opcode, payload) = instruction.destructure + + switch opcode { + case .invalid: + fatalError("Invalid program") + case .moveImmediate: + return .moveImmediate + case .moveCurrentPosition: + return .moveCurrentPosition + case .branch: + return .branch + case .condBranchZeroElseDecrement: + return .condBranchZeroElseDecrement + case .condBranchSamePosition: + return .condBranchSamePosition + case .save: + return .save + case .saveAddress: + return .saveAddress + case .splitSaving: + return .splitSaving + case .clear: + return .clear + case .clearThrough: + return .clearThrough + case .accept: + return .accept + case .fail: + return .fail + case .advance: + return .advance + case .match: + let (isCaseInsensitive, _) = payload.elementPayload + if isCaseInsensitive { + return .matchCaseInsensitive + } else { + return .match + } + case .matchScalar: + let (_, caseInsensitive, boundaryCheck) = payload.scalarPayload + if caseInsensitive { + if boundaryCheck { + return .matchScalarCaseInsensitive + } else { + return .matchScalarCaseInsensitiveUnchecked + } + } else { + if boundaryCheck { + return .matchScalar + } else { + return .matchScalarUnchecked + } + } + case .matchBitset: + let 
(isScalar, _) = payload.bitsetPayload + if isScalar { + return .matchBitsetScalar + } else { + return .matchBitset + } + case .consumeBy: + return consumeBy + case .assertBy: + return .assertBy + case .matchBy: + return .matchBy + case .backreference: + return .backreference + case .beginCapture: + return .beginCapture + case .endCapture: + return .endCapture + case .transformCapture: + return .transformCapture + case .captureValue: + return .captureValue + case .builtinAssertion: + return .builtinAssertion + case .builtinCharacterClass: + return .builtinCharacterClass +} + } +} + extension RegexTests { private func testCompilationEquivalence( @@ -147,16 +272,24 @@ extension RegexTests { for regex: String, syntax: SyntaxOptions = .traditional, semanticLevel: RegexSemanticLevel? = nil, - contains targets: Set, + contains targets: Set = [], + doesNotContain invalid: Set = [], file: StaticString = #file, line: UInt = #line ) { do { let prog = try _compileRegex(regex, syntax, semanticLevel) - var found: Set = [] + var found: Set = [] for inst in prog.engine.instructions { - if targets.contains(inst.opcode) { - found.insert(inst.opcode) + let decoded = DecodedInstr.decode(inst) + found.insert(decoded) + + if invalid.contains(decoded) { + XCTFail( + "Compiled regex '\(regex)' contains incorrect opcode \(decoded)", + file: file, + line: line) + return } } @@ -174,39 +307,95 @@ extension RegexTests { } } - private func expectProgram( - for regex: String, - syntax: SyntaxOptions = .traditional, - semanticLevel: RegexSemanticLevel? 
= nil, - doesNotContain targets: Set, - file: StaticString = #file, - line: UInt = #line - ) { - do { - let prog = try _compileRegex(regex, syntax, semanticLevel) - for inst in prog.engine.instructions { - if targets.contains(inst.opcode) { - XCTFail( - "Compiled regex '\(regex)' contains incorrect opcode \(inst.opcode)", - file: file, - line: line) - return - } - } - } catch { - XCTFail( - "Failed to compile regex '\(regex)': \(error)", - file: file, - line: line) - } - } - func testBitsetCompile() { - expectProgram(for: "[abc]", contains: [.matchBitset]) - expectProgram(for: "[abc]", doesNotContain: [.consumeBy]) + expectProgram( + for: "[abc]", + contains: [.matchBitset], + doesNotContain: [.consumeBy, .matchBitsetScalar]) + expectProgram( + for: "[abc]", + semanticLevel: .unicodeScalar, + contains: [.matchBitsetScalar], + doesNotContain: [.matchBitset, .consumeBy]) + } - expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, doesNotContain: [.matchBitset]) - expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, contains: [.consumeBy]) + func testScalarOptimizeCompilation() { + // all ascii quoted literal -> elide boundary checks + expectProgram( + for: "abcd", + contains: [.matchScalar, .matchScalarUnchecked], + doesNotContain: [.match, .consumeBy]) + // ascii character -> matchScalar with boundary check + expectProgram( + for: "a", + contains: [.matchScalar], + doesNotContain: [.match, .consumeBy, .matchScalarUnchecked]) + // quoted literal is not all ascii -> match scalar when possible, always do boundary checks + expectProgram( + for: "aaa\u{301}", + contains: [.match, .matchScalar], + doesNotContain: [.consumeBy, .matchScalarUnchecked]) + // scalar mode -> always emit match scalar without boundary checks + expectProgram( + for: "abcd", + semanticLevel: .unicodeScalar, + contains: [.matchScalarUnchecked], + doesNotContain: [.match, .consumeBy, .matchScalar]) + expectProgram( + for: "a", + semanticLevel: .unicodeScalar, + contains: 
[.matchScalarUnchecked], + doesNotContain: [.match, .consumeBy, .matchScalar]) + expectProgram( + for: "aaa\u{301}", + semanticLevel: .unicodeScalar, + contains: [.matchScalarUnchecked], + doesNotContain: [.match, .consumeBy, .matchScalar]) + } + + func testCaseInsensitivityCompilation() { + // quoted literal is all ascii -> match scalar case insensitive and skip + // boundary checks + expectProgram( + for: "(?i)abcd", + contains: [.matchScalarCaseInsensitiveUnchecked, .matchScalarCaseInsensitive], + doesNotContain: [.match, .matchCaseInsensitive, .matchScalar, .matchScalarUnchecked]) + // quoted literal is all non-cased ascii -> emit match scalar instructions + expectProgram( + for: "(?i)&&&&", + contains: [.matchScalar, .matchScalarUnchecked], + doesNotContain: [.match, .matchCaseInsensitive, + .matchScalarCaseInsensitive, .matchScalarCaseInsensitiveUnchecked]) + // quoted literal is not all ascii -> match scalar case insensitive when + // possible, match character case insensitive when needed, always perform + // boundary check + expectProgram( + for: "(?i)abcd\u{301}", + contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive], + doesNotContain: [.matchScalarCaseInsensitiveUnchecked, .match, .matchScalar]) + // same as before but contains ascii non cased characters -> emit matchScalar for them + expectProgram( + for: "(?i)abcd\u{301};.'!", + contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive, .matchScalar], + doesNotContain: [.matchScalarCaseInsensitiveUnchecked, .match]) + // contains non-ascii non-cased characters -> emit match + expectProgram( + for: "(?i)abcd\u{301};.'!💖", + contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive, .matchScalar, .match], + doesNotContain: [.matchScalarCaseInsensitiveUnchecked]) + + // scalar mode -> emit unchecked scalar match only, emit case insensitive + // only if the scalar is cased + expectProgram( + for: "(?i);.'!💖", + semanticLevel: .unicodeScalar, + contains: [.matchScalarUnchecked], + 
doesNotContain: [.matchScalarCaseInsensitiveUnchecked]) + expectProgram( + for: "(?i)abcdé", + semanticLevel: .unicodeScalar, + contains: [.matchScalarCaseInsensitiveUnchecked], + doesNotContain: [.matchScalarUnchecked]) } func testQuantificationForwardProgressCompile() { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index e2587368d..a8f7977d6 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -163,7 +163,7 @@ func firstMatchTest( if xfail { XCTAssertNotEqual(found, match, file: file, line: line) } else { - XCTAssertEqual(found, match, file: file, line: line) + XCTAssertEqual(found, match, "Incorrect match", file: file, line: line) } } catch { // FIXME: This allows non-matches to succeed even when xfail'd @@ -603,6 +603,12 @@ extension RegexTests { ("A", true), ("a", false)) + matchTest(#"(?i)[a]"#, + ("💿", false), + ("a\u{301}", false), + ("A", true), + ("a", true)) + matchTest("[a]", ("a\u{301}", false)) @@ -617,14 +623,12 @@ extension RegexTests { // interpreted as matching the scalars "\r" or "\n". 
// It does not fully match the character "\r\n" because the character class // in scalar mode will only match one scalar - do { - let regex = try Regex("[\r\n]").matchingSemantics(.unicodeScalar) - XCTAssertEqual("\r", try regex.wholeMatch(in: "\r")?.0) - XCTAssertEqual("\n", try regex.wholeMatch(in: "\n")?.0) - XCTAssertEqual(nil, try regex.wholeMatch(in: "\r\n")?.0) - } catch { - XCTFail("\(error)", file: #filePath, line: #line) - } + matchTest( + "^[\r\n]$", + ("\r", true), + ("\n", true), + ("\r\n", false), + semanticLevel: .unicodeScalar) matchTest("[^\r\n]", ("\r\n", false), @@ -632,7 +636,17 @@ extension RegexTests { ("\r", true)) matchTest("[\n\r]", ("\n", true), - ("\r", true)) + ("\r", true), + ("\r\n", false)) + + matchTest( + #"[a]\u0301"#, + ("a\u{301}", false), + semanticLevel: .graphemeCluster) + matchTest( + #"[a]\u0301"#, + ("a\u{301}", true), + semanticLevel: .unicodeScalar) let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" @@ -1903,6 +1917,19 @@ extension RegexTests { // TODO: Add test for grapheme boundaries at start/end of match + // Testing the matchScalar optimization for ascii quoted literals and characters + func testScalarOptimization() throws { + // check that we are correctly doing the boundary check after matchScalar + firstMatchTest("a", input: "a\u{301}", match: nil) + firstMatchTest("aa", input: "aa\u{301}", match: nil) + + firstMatchTest("a", input: "a\u{301}", match: "a", semanticLevel: .unicodeScalar) + firstMatchTest("aa", input: "aa\u{301}", match: "aa", semanticLevel: .unicodeScalar) + + // case insensitive tests + firstMatchTest(#"(?i)abc\u{301}d"#, input: "AbC\u{301}d", match: "AbC\u{301}d", semanticLevel: .unicodeScalar) + } + func testCase() { let regex = try! 
Regex(#".\N{SPARKLING HEART}."#) let input = "🧟‍♀️💖🧠 or 🧠💖☕️" diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index e33b10c31..3b0a8d5b3 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -148,4 +148,34 @@ extension RenderDSLTests { } """#) } + + func testScalar() throws { + try testConversion(#"\u{B4}"#, #""" + Regex { + "\u{B4}" + } + """#) + try testConversion(#"\u{301}"#, #""" + Regex { + "\u{301}" + } + """#) + try testConversion(#"[\u{301}]"#, #""" + Regex { + One(.anyOf("\u{301}")) + } + """#) + try testConversion(#"[abc\u{301}]"#, #""" + Regex { + One(.anyOf("abc\u{301}")) + } + """#) + + // TODO: We ought to try and preserve the scalar syntax here. + try testConversion(#"a\u{301}"#, #""" + Regex { + "á" + } + """#) + } } From 33a937cf5353375a2fb46a86326f1d40ceaf9e3b Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 7 Jul 2022 11:47:53 +0100 Subject: [PATCH 19/28] Validate optimizations when a match fails This allows us to catch the case where a match occurs without optimizations, but doesn't occur with optimizations. Additionally fix the `xfail` param such that it can't be used on tests that actually match expectations. --- Tests/RegexTests/MatchTests.swift | 82 ++++++++++++++++--------------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index a8f7977d6..8ab96c33e 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -26,23 +26,33 @@ func _firstMatch( validateOptimizations: Bool, semanticLevel: RegexSemanticLevel = .graphemeCluster, syntax: SyntaxOptions = .traditional -) throws -> (String, [String?]) { +) throws -> (String, [String?])? 
{ var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel) - guard let result = try regex.firstMatch(in: input) else { - throw MatchError("match not found for \(regexStr) in \(input)") - } - let caps = result.output.slices(from: input) - + let result = try regex.firstMatch(in: input) + if validateOptimizations { regex._setCompilerOptionsForTesting(.disableOptimizations) - guard let unoptResult = try regex.firstMatch(in: input) else { + let unoptResult = try regex.firstMatch(in: input) + if result != nil && unoptResult == nil { throw MatchError("match not found for unoptimized \(regexStr) in \(input)") } - XCTAssertEqual( - String(input[result.range]), - String(input[unoptResult.range]), - "Unoptimized regex returned a different result") + if result == nil && unoptResult != nil { + throw MatchError("match not found in optimized \(regexStr) in \(input)") + } + if let result = result, let unoptResult = unoptResult { + let optMatch = String(input[result.range]) + let unoptMatch = String(input[unoptResult.range]) + if optMatch != unoptMatch { + throw MatchError(""" + + Unoptimized regex returned: '\(unoptMatch)' + Optimized regex returned: '\(optMatch)' + """) + } + } } + guard let result = result else { return nil } + let caps = result.output.slices(from: input) return (String(input[result.range]), caps.map { $0.map(String.init) }) } @@ -153,12 +163,12 @@ func firstMatchTest( line: UInt = #line ) { do { - let (found, _) = try _firstMatch( + let found = try _firstMatch( regex, input: input, validateOptimizations: validateOptimizations, semanticLevel: semanticLevel, - syntax: syntax) + syntax: syntax)?.0 if xfail { XCTAssertNotEqual(found, match, file: file, line: line) @@ -166,9 +176,7 @@ func firstMatchTest( XCTAssertEqual(found, match, "Incorrect match", file: file, line: line) } } catch { - // FIXME: This allows non-matches to succeed even when xfail'd - // When xfail == true, this should report failure for match == nil - if !xfail && match != 
nil { + if !xfail { XCTFail("\(error)", file: file, line: line) } return @@ -428,8 +436,7 @@ extension RegexTests { "a++a", ("babc", nil), ("baaabc", nil), - ("bb", nil), - xfail: true) + ("bb", nil)) firstMatchTests( "a+?a", ("babc", nil), @@ -505,23 +512,19 @@ extension RegexTests { ("baabc", nil), ("bb", nil)) - // XFAIL'd versions of the above firstMatchTests( "a{2,4}+a", - ("baaabc", nil), - xfail: true) + ("baaabc", nil)) firstMatchTests( "a{,4}+a", ("babc", nil), ("baabc", nil), - ("baaabc", nil), - xfail: true) + ("baaabc", nil)) firstMatchTests( "a{2,}+a", ("baaabc", nil), ("baaaaabc", nil), - ("baaaaaaaabc", nil), - xfail: true) + ("baaaaaaaabc", nil)) // XFAIL'd possessive tests firstMatchTests( @@ -773,6 +776,11 @@ extension RegexTests { } firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}") + // FIXME: This produces a different result with and without optimizations. + firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil, xfail: true) + firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil, + validateOptimizations: false) + // Currently not supported in the matching engine. 
for c: UnicodeScalar in ["a", "b", "c"] { firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)", @@ -1118,8 +1126,8 @@ extension RegexTests { // TODO: Oniguruma \y and \Y firstMatchTests( #"\u{65}"#, // Scalar 'e' is present in both - ("Cafe\u{301}", nil), // but scalar mode requires boundary at end of match - xfail: true) + ("Cafe\u{301}", nil)) // but scalar mode requires boundary at end of match + firstMatchTests( #"\u{65}"#, // Scalar 'e' is present in both ("Sol Cafe", "e")) // standalone is okay @@ -1711,19 +1719,15 @@ extension RegexTests { firstMatchTest(#"\u{65 301}$"#, input: eComposed, match: eComposed) // FIXME: Implicit \y at end of match - firstMatchTest(#"\u{65}"#, input: eDecomposed, match: nil, - xfail: true) + firstMatchTest(#"\u{65}"#, input: eDecomposed, match: nil) firstMatchTest(#"\u{65}$"#, input: eDecomposed, match: nil) - // FIXME: \y is unsupported - firstMatchTest(#"\u{65}\y"#, input: eDecomposed, match: nil, - xfail: true) + firstMatchTest(#"\u{65}\y"#, input: eDecomposed, match: nil) // FIXME: Unicode scalars are only matched at the start of a grapheme cluster firstMatchTest(#"\u{301}"#, input: eDecomposed, match: "\u{301}", xfail: true) - // FIXME: \y is unsupported - firstMatchTest(#"\y\u{301}"#, input: eDecomposed, match: nil, - xfail: true) + + firstMatchTest(#"\y\u{301}"#, input: eDecomposed, match: nil) } func testCanonicalEquivalence() throws { @@ -1781,13 +1785,11 @@ extension RegexTests { // \s firstMatchTest(#"\s"#, input: " ", match: " ") // FIXME: \s shouldn't match a number composed with a non-number character - firstMatchTest(#"\s\u{305}"#, input: " ", match: nil, - xfail: true) + firstMatchTest(#"\s\u{305}"#, input: " ", match: nil) // \p{Whitespace} firstMatchTest(#"\s"#, input: " ", match: " ") - // FIXME: \p{Whitespace} shouldn't match whitespace composed with a non-whitespace character - firstMatchTest(#"\s\u{305}"#, input: " ", match: nil, - xfail: true) + // \p{Whitespace} shouldn't match whitespace 
composed with a non-whitespace character + firstMatchTest(#"\s\u{305}"#, input: " ", match: nil) } func testCanonicalEquivalenceCustomCharacterClass() throws { From e3435543dee42fd44b4b4ee26750f06869db9d48 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 19 Jul 2022 11:41:55 +0100 Subject: [PATCH 20/28] Guard against testing with older stdlibs Replace a couple of `#if os(Linux)` checks with a check to see if we have a newer stdlib available. This lets us emit an expected failure in the case where we're testing on an older stdlib. --- Package.swift | 6 ++-- Sources/TestSupport/TestSupport.swift | 33 +++++++++++++++++++++ Tests/RegexBuilderTests/RegexDSLTests.swift | 5 +--- Tests/RegexTests/MatchTests.swift | 27 +++++++++-------- 4 files changed, 52 insertions(+), 19 deletions(-) create mode 100644 Sources/TestSupport/TestSupport.swift diff --git a/Package.swift b/Package.swift index abc895813..c1e9bff37 100644 --- a/Package.swift +++ b/Package.swift @@ -75,15 +75,17 @@ let package = Package( name: "RegexBuilder", dependencies: ["_StringProcessing", "_RegexParser"], swiftSettings: publicStdlibSettings), + .target(name: "TestSupport", + swiftSettings: [availabilityDefinition]), .testTarget( name: "RegexTests", - dependencies: ["_StringProcessing"], + dependencies: ["_StringProcessing", "TestSupport"], swiftSettings: [ .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]), ]), .testTarget( name: "RegexBuilderTests", - dependencies: ["_StringProcessing", "RegexBuilder"], + dependencies: ["_StringProcessing", "RegexBuilder", "TestSupport"], swiftSettings: [ .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]) ]), diff --git a/Sources/TestSupport/TestSupport.swift b/Sources/TestSupport/TestSupport.swift new file mode 100644 index 000000000..b60adb63f --- /dev/null +++ b/Sources/TestSupport/TestSupport.swift @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the 
Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import XCTest + +// We need to split this out of the test files, as it needs to be compiled +// *without* `-disable-availability-checking` to ensure the #available check is +// not compiled into a no-op. + +#if os(Linux) +public func XCTExpectFailure( + _ message: String? = nil, body: () throws -> Void +) rethrows {} +#endif + +/// Guards certain tests to make sure we have a new stdlib available. +public func ensureNewStdlib( + file: StaticString = #file, line: UInt = #line +) -> Bool { + guard #available(SwiftStdlib 5.7, *) else { + XCTExpectFailure { XCTFail("Unsupported stdlib", file: file, line: line) } + return false + } + return true +} diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 05375a1f7..d95d4ce2c 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -12,10 +12,7 @@ import XCTest import _StringProcessing import RegexBuilder - -#if os(Linux) -func XCTExpectFailure(_ message: String? = nil, body: () throws -> Void) rethrows {} -#endif +import TestSupport class RegexDSLTests: XCTestCase { func _testDSLCaptures( diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 8ab96c33e..9affdf6ae 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -12,6 +12,7 @@ import XCTest @testable import _RegexParser @testable import _StringProcessing +import TestSupport struct MatchError: Error { var message: String @@ -1046,6 +1047,9 @@ extension RegexTests { } func testMatchAnchors() throws { + // Must have new stdlib for character class ranges and word boundaries. 
+ guard ensureNewStdlib() else { return } + // MARK: Anchors firstMatchTests( #"^\d+"#, @@ -1094,8 +1098,6 @@ extension RegexTests { (" 123\n456\n", nil), ("123 456", "456")) - // FIXME: Keep this until _wordIndex and friends are -#if os(Linux) firstMatchTests( #"\d+\b"#, ("123", "123"), @@ -1113,7 +1115,6 @@ extension RegexTests { ("123", "23"), (" 123", "23"), ("123 456", "23")) -#endif // TODO: \G and \K do { @@ -1144,9 +1145,10 @@ extension RegexTests { ("Sol Cafe", nil), xfail: true) } - // FIXME: Keep this until _wordIndex and friends are -#if os(Linux) func testLevel2WordBoundaries() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + // MARK: Level 2 Word Boundaries firstMatchTest(#"\b😊\b"#, input: "🔥😊👍", match: "😊") firstMatchTest(#"\b👨🏽\b"#, input: "👩🏻👶🏿👨🏽🧑🏾👩🏼", match: "👨🏽") @@ -1162,8 +1164,7 @@ extension RegexTests { firstMatchTest(#"can\B\'\Bt"#, input: "I can't do that.", match: "can't") firstMatchTest(#"\b÷\b"#, input: "3 ÷ 3 = 1", match: "÷") } -#endif - + func testMatchGroups() { // MARK: Groups @@ -1388,6 +1389,9 @@ extension RegexTests { } func testMatchExamples() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + // Backreferences matchTest( #"(sens|respons)e and \1ibility"#, @@ -1437,8 +1441,6 @@ extension RegexTests { xfail: true ) - // FIXME: Keep this until _wordIndex and friends are -#if os(Linux) // HTML tags matchTest( #"<([a-zA-Z][a-zA-Z0-9]*)\b[^>]*>.*?"#, @@ -1456,7 +1458,6 @@ extension RegexTests { ("pass me the the kettle", ["the"]), ("this doesn't have any", nil) ) -#endif // Floats flatCaptureTest( @@ -1536,6 +1537,9 @@ extension RegexTests { } func testASCIIClasses() { + // Must have new stdlib for character class ranges and word boundaries. 
+ guard ensureNewStdlib() else { return } + // 'D' ASCII-only digits matchTest( #"\d+"#, @@ -1564,8 +1568,6 @@ extension RegexTests { ("aeiou", true), ("åe\u{301}ïôú", false)) - // FIXME: Keep this until _wordIndex and friends are -#if os(Linux) matchTest( #"abcd\b.+"#, ("abcd ef", true), @@ -1581,7 +1583,6 @@ extension RegexTests { ("abcd ef", true), ("abcdef", false), ("abcdéf", false)) -#endif // 'S' ASCII-only spaces matchTest( From 1acb82adf9be5cc73ccfae81607c7be9c12543bf Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 19 Jul 2022 11:41:56 +0100 Subject: [PATCH 21/28] Add some extra character class newline matching tests --- Tests/RegexTests/MatchTests.swift | 70 ++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 2 deletions(-) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 9affdf6ae..b2e8aa247 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -320,8 +320,6 @@ extension RegexTests { input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t", match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t") - firstMatchTest(#"\r\n"#, input: "\r\n", match: "\r\n") - // MARK: Quotes firstMatchTest( @@ -1473,6 +1471,74 @@ extension RegexTests { firstMatchTest(#".+"#, input: "a\nb", match: "a") firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb") } + + func testMatchNewlines() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + + for semantics in [RegexSemanticLevel.unicodeScalar, .graphemeCluster] { + firstMatchTest( + #"\r\n"#, input: "\r\n", match: "\r\n", + semanticLevel: semantics + ) + firstMatchTest( + #"\r\n"#, input: "\n", match: nil, semanticLevel: semantics) + firstMatchTest( + #"\r\n"#, input: "\r", match: nil, semanticLevel: semantics) + + // \r\n is not treated as ASCII. 
+ firstMatchTest( + #"^\p{ASCII}$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^\r$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^[\r]$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^\n$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^[\n]$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^[\u{0}-\u{7F}]$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + + let scalarSemantics = semantics == .unicodeScalar + firstMatchTest( + #"\p{ASCII}"#, input: "\r\n", match: scalarSemantics ? "\r" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"\r"#, input: "\r\n", match: scalarSemantics ? "\r" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"[\r]"#, input: "\r\n", match: scalarSemantics ? "\r" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"\n"#, input: "\r\n", match: scalarSemantics ? "\n" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"[\n]"#, input: "\r\n", match: scalarSemantics ? "\n" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"[\u{0}-\u{7F}]"#, input: "\r\n", match: scalarSemantics ? "\r" : nil, + semanticLevel: semantics + ) + } + } func testCaseSensitivity() { matchTest( From f56ac1126805477dae1f003630b51a02d5d001f7 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 19 Jul 2022 11:41:57 +0100 Subject: [PATCH 22/28] Fix character class range matching Previously we performed a lexicographic comparison with the bounds of a character class range. However this produced surprising results, and our implementation didn't properly handle case sensitivity. Update the logic to instead only allow single scalar NFC bounds. The input is then converted to NFC in grapheme semantic mode, and checked against the range. In scalar semantic mode, the input scalar is checked on its own. 
Additionally, fix the case sensitivity handling such that we check both the lowercase and uppercase version of the input against the range. --- Sources/_RegexParser/Regex/AST/Atom.swift | 6 +- Sources/_RegexParser/Utility/Misc.swift | 15 ++ .../_StringProcessing/ConsumerInterface.swift | 70 +++++--- .../Unicode/CharacterProps.swift | 7 - Sources/_StringProcessing/Unicode/NFC.swift | 55 ++++++ Tests/RegexBuilderTests/RegexDSLTests.swift | 9 + Tests/RegexTests/MatchTests.swift | 164 +++++++++++++----- Tests/RegexTests/ParseTests.swift | 41 ++++- Tests/RegexTests/UTS18Tests.swift | 7 + 9 files changed, 299 insertions(+), 75 deletions(-) create mode 100644 Sources/_StringProcessing/Unicode/NFC.swift diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index b03ce8c39..8706327f7 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -755,8 +755,10 @@ extension AST.Atom { /// Whether this atom is valid as the operand of a custom character class /// range. public var isValidCharacterClassRangeBound: Bool { - // If we have a literal character value for this, it can be used as a bound. - if literalCharacterValue != nil { return true } + if let c = literalCharacterValue { + // We only match character range bounds that are single scalar NFC. + return c.hasExactlyOneScalar && c.isNFC + } switch kind { // \cx, \C-x, \M-x, \M-\C-x, \N{...} case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter: diff --git a/Sources/_RegexParser/Utility/Misc.swift b/Sources/_RegexParser/Utility/Misc.swift index d37dfbd4a..70dc7a7d5 100644 --- a/Sources/_RegexParser/Utility/Misc.swift +++ b/Sources/_RegexParser/Utility/Misc.swift @@ -19,6 +19,21 @@ extension Substring { var string: String { String(self) } } +extension Character { + /// Whether this character is made up of exactly one Unicode scalar value. 
+ public var hasExactlyOneScalar: Bool { + let scalars = unicodeScalars + return scalars.index(after: scalars.startIndex) == scalars.endIndex + } + + /// Whether the given character is in NFC form. + internal var isNFC: Bool { + if isASCII { return true } + let str = String(self) + return str._nfcCodeUnits.elementsEqual(str.utf8) + } +} + extension CustomStringConvertible { @_alwaysEmitIntoClient public var halfWidthCornerQuoted: String { diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 668d16eb6..c96775500 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -361,38 +361,60 @@ extension DSLTree.CustomCharacterClass.Member { } return c case let .range(low, high): - // TODO: - guard let lhs = low.literalCharacterValue else { + guard let lhs = low.literalCharacterValue?.singleScalar, lhs.isNFC else { throw Unsupported("\(low) in range") } - guard let rhs = high.literalCharacterValue else { + guard let rhs = high.literalCharacterValue?.singleScalar, rhs.isNFC else { throw Unsupported("\(high) in range") } + guard lhs <= rhs else { + throw Unsupported("Invalid range \(low)-\(high)") + } - if opts.isCaseInsensitive { - let lhsLower = lhs.lowercased() - let rhsLower = rhs.lowercased() - guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } - return { input, bounds in - // TODO: check for out of bounds? - let curIdx = bounds.lowerBound - if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) { - // TODO: semantic level - return input.index(after: curIdx) - } - return nil + let isCaseInsensitive = opts.isCaseInsensitive + let isCharacterSemantic = opts.semanticLevel == .graphemeCluster + + return { input, bounds in + let curIdx = bounds.lowerBound + let nextIndex = isCharacterSemantic + ? 
input.index(after: curIdx) + : input.unicodeScalars.index(after: curIdx) + + // Under grapheme semantics, we compare based on single NFC scalars. If + // such a character is not single scalar under NFC, the match fails. In + // scalar semantics, we compare the exact scalar value to the NFC + // bounds. + let scalar = isCharacterSemantic ? input[curIdx].singleNFCScalar + : input.unicodeScalars[curIdx] + guard let scalar = scalar else { return nil } + let scalarRange = lhs ... rhs + if scalarRange.contains(scalar) { + return nextIndex } - } else { - guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } - return { input, bounds in - // TODO: check for out of bounds? - let curIdx = bounds.lowerBound - if (lhs...rhs).contains(input[curIdx]) { - // TODO: semantic level - return input.index(after: curIdx) + + // Check for case insensitive matches. + func matchesCased( + _ cased: (UnicodeScalar.Properties) -> String + ) -> Bool { + let casedStr = cased(scalar.properties) + // In character semantic mode, we need to map to NFC. In scalar + // semantics, we should have an exact scalar. + let mapped = isCharacterSemantic ? casedStr.singleNFCScalar + : casedStr.singleScalar + guard let mapped = mapped else { return false } + return scalarRange.contains(mapped) + } + if isCaseInsensitive { + if scalar.properties.changesWhenLowercased, + matchesCased(\.lowercaseMapping) { + return nextIndex + } + if scalar.properties.changesWhenUppercased, + matchesCased(\.uppercaseMapping) { + return nextIndex } - return nil } + return nil } case let .custom(ccc): diff --git a/Sources/_StringProcessing/Unicode/CharacterProps.swift b/Sources/_StringProcessing/Unicode/CharacterProps.swift index 80f6819a6..e0be4e386 100644 --- a/Sources/_StringProcessing/Unicode/CharacterProps.swift +++ b/Sources/_StringProcessing/Unicode/CharacterProps.swift @@ -11,10 +11,3 @@ // TODO - -extension Character { - /// Whether this character is made up of exactly one Unicode scalar value. 
- var hasExactlyOneScalar: Bool { - unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex - } -} diff --git a/Sources/_StringProcessing/Unicode/NFC.swift b/Sources/_StringProcessing/Unicode/NFC.swift new file mode 100644 index 000000000..5c2c4aa48 --- /dev/null +++ b/Sources/_StringProcessing/Unicode/NFC.swift @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +@_spi(_Unicode) +import Swift + +extension UnicodeScalar { + /// Checks whether the scalar is in NFC form. + var isNFC: Bool { Character(self).singleNFCScalar == self } +} + +extension Character { + /// If the given character consists of a single NFC scalar, returns it. If + /// there are multiple NFC scalars, returns `nil`. + var singleNFCScalar: UnicodeScalar? { + // SwiftStdlib is always >= 5.7 for a shipped StringProcessing. + guard #available(SwiftStdlib 5.7, *) else { return nil } + var nfcIter = String(self)._nfc.makeIterator() + guard let scalar = nfcIter.next(), nfcIter.next() == nil else { return nil } + return scalar + } + + /// If the given character contains a single scalar, returns it. If none or + /// multiple scalars are present, returns `nil`. + var singleScalar: UnicodeScalar? { + hasExactlyOneScalar ? unicodeScalars.first! : nil + } +} + +extension String { + /// If the given string consists of a single NFC scalar, returns it. If none + /// or multiple NFC scalars are present, returns `nil`. + var singleNFCScalar: UnicodeScalar? 
{ + guard !isEmpty && index(after: startIndex) == endIndex else { return nil } + return first!.singleNFCScalar + } + + /// If the given string contains a single scalar, returns it. If none or + /// multiple scalars are present, returns `nil`. + var singleScalar: UnicodeScalar? { + let scalars = unicodeScalars + guard !scalars.isEmpty && + scalars.index(after: scalars.startIndex) == scalars.endIndex + else { return nil } + return scalars.first! + } +} diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index d95d4ce2c..1d186e0bc 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -74,6 +74,9 @@ class RegexDSLTests: XCTestCase { let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" func testCharacterClasses() throws { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + try _testDSLCaptures( ("a c", ("a c", " ", "c")), matchType: (Substring, Substring, Substring).self, ==) @@ -248,6 +251,9 @@ class RegexDSLTests: XCTestCase { } func testCharacterClassOperations() throws { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + try _testDSLCaptures( ("bcdefn1a", "bcdefn1a"), ("nbcdef1a", nil), // fails symmetric difference lookahead @@ -591,6 +597,9 @@ class RegexDSLTests: XCTestCase { } func testQuantificationBehavior() throws { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + // Eager by default try _testDSLCaptures( ("abc1def2", ("abc1def2", "2")), diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index b2e8aa247..0a0abfc92 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -570,6 +570,9 @@ extension RegexTests { } func testMatchCharacterClasses() { + // Must have new stdlib for character class ranges and word boundaries. 
+ guard ensureNewStdlib() else { return } + // MARK: Character classes firstMatchTest(#"abc\d"#, input: "xyzabc123", match: "abc1") @@ -775,10 +778,14 @@ extension RegexTests { } firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}") - // FIXME: This produces a different result with and without optimizations. - firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil, xfail: true) - firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil, - validateOptimizations: false) + firstMatchTest(#"[12]"#, input: "1️⃣", match: nil) + firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil) + firstMatchTest(#"[\d]"#, input: "1️⃣", match: "1️⃣") + firstMatchTest(#"(?P)[\d]"#, input: "1️⃣", match: nil) + firstMatchTest("[0-2&&1-3]", input: "1️⃣", match: nil) + firstMatchTest("[1-2e\u{301}]", input: "1️⃣", match: nil) + + firstMatchTest(#"[\u{3A9}-\u{3A9}]"#, input: "\u{3A9}", match: "\u{3A9}") // Currently not supported in the matching engine. for c: UnicodeScalar in ["a", "b", "c"] { @@ -833,6 +840,35 @@ extension RegexTests { firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: "abc", syntax: .experimental) firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: #""abc""#) + + for semantics in [RegexSemanticLevel.unicodeScalar, .graphemeCluster] { + // Case sensitivity and ranges. 
+ for ch in "abcD" { + firstMatchTest("[a-cD]", input: String(ch), match: String(ch)) + } + for ch in "ABCd" { + firstMatchTest("[a-cD]", input: String(ch), match: nil) + } + for ch in "abcABCdD" { + let input = String(ch) + firstMatchTest( + "(?i)[a-cd]", input: input, match: input, semanticLevel: semantics) + firstMatchTest( + "(?i)[A-CD]", input: input, match: input, semanticLevel: semantics) + } + for ch in "XYZ[\\]^_`abcd" { + let input = String(ch) + firstMatchTest( + "[X-cd]", input: input, match: input, semanticLevel: semantics) + } + for ch in "XYZ[\\]^_`abcxyzABCdD" { + let input = String(ch) + firstMatchTest( + "(?i)[X-cd]", input: input, match: input, semanticLevel: semantics) + firstMatchTest( + "(?i)[X-cD]", input: input, match: input, semanticLevel: semantics) + } + } } func testCharacterProperties() { @@ -1164,6 +1200,9 @@ extension RegexTests { } func testMatchGroups() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + // MARK: Groups // Named captures @@ -1541,6 +1580,9 @@ extension RegexTests { } func testCaseSensitivity() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + matchTest( #"c..e"#, ("cafe", true), @@ -1774,6 +1816,9 @@ extension RegexTests { var eComposed: String { "é" } var eDecomposed: String { "e\u{301}" } + var eComposedUpper: String { "É" } + var eDecomposedUpper: String { "E\u{301}" } + func testIndividualScalars() { // Expectation: A standalone Unicode scalar value in a regex literal // can match either that specific scalar value or participate in matching @@ -1860,31 +1905,62 @@ extension RegexTests { } func testCanonicalEquivalenceCustomCharacterClass() throws { - // Expectation: Concatenations with custom character classes should be able - // to match within a grapheme cluster. 
That is, a regex should be able to - // match the scalar values that comprise a grapheme cluster in separate, - // or repeated, custom character classes. - + // Expectation: Custom character class matches do not cross grapheme + // character boundaries by default. When matching with Unicode scalar + // semantics, grapheme cluster boundaries are ignored, so matching + // sequences of custom character classes can succeed. + + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + matchTest( #"[áéíóú]$"#, (eComposed, true), (eDecomposed, true)) - // FIXME: Custom char classes don't use canonical equivalence with composed characters - firstMatchTest(#"e[\u{301}]$"#, input: eComposed, match: eComposed, - xfail: true) - firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: eComposed, - xfail: true) - firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: eComposed, - xfail: true) + for input in [eDecomposed, eComposed] { + // Unicode scalar semantics means that only the decomposed version can + // match here. + let match = input.unicodeScalars.count == 2 ? input : nil + firstMatchTest( + #"e[\u{301}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + firstMatchTest( + #"e[\u{300}-\u{320}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + firstMatchTest( + #"[e][\u{300}-\u{320}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + firstMatchTest( + #"[e-e][\u{300}-\u{320}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + firstMatchTest( + #"[a-z][\u{300}-\u{320}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + } + for input in [eComposed, eDecomposed] { + // Grapheme cluster semantics means that we can't match the 'e' separately + // from the accent. 
+ firstMatchTest(#"e[\u{301}]$"#, input: input, match: nil) + firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: input, match: nil) + firstMatchTest(#"[e][\u{300}-\u{320}]$"#, input: input, match: nil) + firstMatchTest(#"[e-e][\u{300}-\u{320}]$"#, input: input, match: nil) + firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: input, match: nil) + + // A range that covers é (U+E9). Inputs are mapped to NFC, so match. + firstMatchTest(#"[\u{E8}-\u{EA}]"#, input: input, match: input) + } - // FIXME: Custom char classes don't match decomposed characters - firstMatchTest(#"e[\u{301}]$"#, input: eDecomposed, match: eDecomposed, - xfail: true) - firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed, - xfail: true) - firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed, - xfail: true) + // A range that covers É (U+C9). Inputs are mapped to NFC, so match. + for input in [eComposedUpper, eDecomposedUpper] { + firstMatchTest(#"[\u{C8}-\u{CA}]"#, input: input, match: input) + firstMatchTest(#"[\u{C9}-\u{C9}]"#, input: input, match: input) + } + // Case insensitive matching of É (U+C9). 
+ for input in [eComposed, eDecomposed, eComposedUpper, eDecomposedUpper] { + firstMatchTest(#"(?i)[\u{C8}-\u{CA}]"#, input: input, match: input) + firstMatchTest(#"(?i)[\u{C9}-\u{C9}]"#, input: input, match: input) + } let flag = "🇰🇷" firstMatchTest(#"🇰🇷"#, input: flag, match: flag) @@ -1893,27 +1969,33 @@ extension RegexTests { firstMatchTest(#"\u{1F1F0 1F1F7}"#, input: flag, match: flag) // First Unicode scalar followed by CCC of regional indicators - firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag, - xfail: true) - - // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character + firstMatchTest( + #"^\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: flag, + semanticLevel: .unicodeScalar + ) + // A CCC of regional indicators followed by the second Unicode scalar + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}$"#, input: flag, match: flag, + semanticLevel: .unicodeScalar + ) // A CCC of regional indicators x 2 - firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]{2}"#, input: flag, match: flag, - xfail: true) + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]{2}$"#, input: flag, match: flag, + semanticLevel: .unicodeScalar + ) + // A CCC of N regional indicators + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]+$"#, input: flag, match: flag, + semanticLevel: .unicodeScalar + ) - // FIXME: A single CCC of regional indicators matches the whole flag character - // A CCC of regional indicators followed by the second Unicode scalar - firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#, input: flag, match: flag, - xfail: true) // A single CCC of regional indicators - firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: nil, - xfail: true) - - // A single CCC of actual flag emojis / combined regional indicators - firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: flag, match: flag) - // This succeeds (correctly) because \u{1F1F0} is lexicographically - // within the CCC range - firstMatchTest(#"[🇦🇫-🇿🇼]"#, input: "\u{1F1F0}abc", match: 
"\u{1F1F0}") + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil) + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil, + semanticLevel: .unicodeScalar + ) } func testAnyChar() throws { diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 52a272915..f5e93c2bd 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -374,10 +374,21 @@ extension RegexTests { // MARK: Allowed combining characters - parseTest("e\u{301}", "e\u{301}") parseTest("1\u{358}", "1\u{358}") parseTest(#"\ \#u{361}"#, " \u{361}") + parseTest("e\u{301}", "e\u{301}") + parseTest("[e\u{301}]", charClass("e\u{301}")) + parseTest("\u{E9}", "e\u{301}") + parseTest("[\u{E9}]", charClass("e\u{301}")) + + parseTest( + "\\e\u{301}", "e\u{301}", throwsError: .invalidEscape("e\u{301}")) + parseTest( + "[\\e\u{301}]", charClass("e\u{301}"), + throwsError: .invalidEscape("e\u{301}") + ) + // MARK: Alternations parseTest( @@ -2885,6 +2896,34 @@ extension RegexTests { diagnosticTest(#"[a-\Qbc\E]"#, .unsupported("range with quoted sequence")) diagnosticTest(#"[\Qbc\E-de]"#, .unsupported("range with quoted sequence")) + diagnosticTest(#"|([🇦🇫-🇿🇼])?"#, .invalidCharacterClassRangeOperand) + diagnosticTest(#"|([👨‍👩‍👦-👩‍👩‍👧‍👧])?"#, .invalidCharacterClassRangeOperand) + + // Not single-scalar NFC. + diagnosticTest("[e\u{301}-e\u{302}]", .invalidCharacterClassRangeOperand) + + // These scalar values expand under NFC. 
+ let nfcExpandingScalars: [UInt32] = [ + 0x344, 0x958, 0x959, 0x95A, 0x95B, 0x95C, 0x95D, 0x95E, 0x95F, 0x9DC, + 0x9DD, 0x9DF, 0xA33, 0xA36, 0xA59, 0xA5A, 0xA5B, 0xA5E, 0xB5C, 0xB5D, + 0xF43, 0xF4D, 0xF52, 0xF57, 0xF5C, 0xF69, 0xF73, 0xF75, 0xF76, 0xF78, + 0xF81, 0xF93, 0xF9D, 0xFA2, 0xFA7, 0xFAC, 0xFB9, 0x2ADC, 0xFB1D, 0xFB1F, + 0xFB2A, 0xFB2B, 0xFB2C, 0xFB2D, 0xFB2E, 0xFB2F, 0xFB30, 0xFB31, 0xFB32, + 0xFB33, 0xFB34, 0xFB35, 0xFB36, 0xFB38, 0xFB39, 0xFB3A, 0xFB3B, 0xFB3C, + 0xFB3E, 0xFB40, 0xFB41, 0xFB43, 0xFB44, 0xFB46, 0xFB47, 0xFB48, 0xFB49, + 0xFB4A, 0xFB4B, 0xFB4C, 0xFB4D, 0xFB4E, 0x1D15E, 0x1D15F, 0x1D160, + 0x1D161, 0x1D162, 0x1D163, 0x1D164, 0x1D1BB, 0x1D1BC, 0x1D1BD, 0x1D1BE, + 0x1D1BF, 0x1D1C0 + ] + for scalar in nfcExpandingScalars { + let hex = String(scalar, radix: 16) + diagnosticTest( + #"[\u{\#(hex)}-\u{\#(hex)}]"#, .invalidCharacterClassRangeOperand) + } + + // The NFC form of U+2126 is U+3A9. + diagnosticTest(#"[\u{2126}-\u{2126}]"#, .invalidCharacterClassRangeOperand) + diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b")) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index aa3639ea6..11479bfb6 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -21,6 +21,7 @@ import XCTest @testable // for internal `matches(of:)` import _StringProcessing +import TestSupport extension UnicodeScalar { var value4Digits: String { @@ -316,6 +317,9 @@ extension UTS18Tests { // surrogate followed by a trailing surrogate shall be handled as a single // code point in matching. func testSupplementaryCodePoints() { + // Must have new stdlib for character class ranges. 
+ guard ensureNewStdlib() else { return } + XCTAssertTrue("👍".contains(regex(#"\u{1F44D}"#))) XCTAssertTrue("👍".contains(regex(#"[\u{1F440}-\u{1F44F}]"#))) XCTAssertTrue("👍👎".contains(regex(#"^[\u{1F440}-\u{1F44F}]+$"#))) @@ -388,6 +392,9 @@ extension UTS18Tests { } func testCharacterClassesWithStrings() { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + let regex = regex(#"[a-z🧐🇧🇪🇧🇫🇧🇬]"#) XCTAssertEqual("🧐", "🧐".wholeMatch(of: regex)?.0) XCTAssertEqual("🇧🇫", "🇧🇫".wholeMatch(of: regex)?.0) From 6523a93e7abba886f43f9cfe6324a98554546193 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 20 Jul 2022 21:22:03 +0100 Subject: [PATCH 23/28] Coalesce adjacent scalars and characters in the DSL Previously we would emit a series of scalars written in the DSL as a series of individual characters in grapheme semantic mode. Change the behavior such that we coalesce any adjacent scalars and characters, including those in regex literals and nested concatenations. We then perform grapheme breaking over the result, and can emit character matches for scalars that coalesced into a grapheme. This transform subsumes a similar transform we performed for regex literals when converting them to a DSLTree. This has the nice side effect of allowing us to better preserve scalar syntax in the DSL transform. 
rdar://96942688 --- Sources/_StringProcessing/ByteCodeGen.swift | 39 +++++++- .../_StringProcessing/PrintAsPattern.swift | 96 +++++++++++++------ .../Regex/ASTConversion.swift | 63 +----------- Sources/_StringProcessing/Regex/DSLTree.swift | 8 ++ Sources/_StringProcessing/Utility/Misc.swift | 47 +++++++++ Tests/RegexBuilderTests/RegexDSLTests.swift | 77 ++++++++++++--- Tests/RegexTests/MatchTests.swift | 37 +++++++ Tests/RegexTests/RenderDSLTests.swift | 53 +++++++++- 8 files changed, 318 insertions(+), 102 deletions(-) create mode 100644 Sources/_StringProcessing/Utility/Misc.swift diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 477760ef8..da21ea26a 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -791,6 +791,41 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitConcatenation(_ children: [DSLTree.Node]) throws { + // Before emitting a concatenation, we need to flatten out any nested + // concatenations, and coalesce any adjacent characters and scalars, forming + // quoted literals of their contents, over which we can perform grapheme + // breaking. + func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] { + switch node { + case .concatenation(let ch): + return ch.flatMap(flatten) + case .convertedRegexLiteral(let n, _): + return flatten(n) + default: + return [node] + } + } + let children = children + .flatMap(flatten) + .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in + switch node { + case .atom(let a): + guard let c = a.literalCharacterValue else { return false } + str.append(c) + return true + case .quotedLiteral(let q): + str += q + return true + default: + return false + } + } + for child in children { + try emitConcatenationComponent(child) + } + } + @discardableResult mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? 
{ switch node { @@ -799,9 +834,7 @@ fileprivate extension Compiler.ByteCodeGen { try emitAlternation(children) case let .concatenation(children): - for child in children { - try emitConcatenationComponent(child) - } + try emitConcatenation(children) case let .capture(name, refId, child, transform): options.beginScope() diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 80f2e7697..9035d2f51 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -70,16 +70,9 @@ extension PrettyPrinter { for namedCapture in namedCaptures { print("let \(namedCapture) = Reference(Substring.self)") } - - switch node { - case .concatenation(_): - printAsPattern(convertedFromAST: node) - case .convertedRegexLiteral(.concatenation(_), _): - printAsPattern(convertedFromAST: node) - default: - printBlock("Regex") { printer in - printer.printAsPattern(convertedFromAST: node) - } + + printBlock("Regex") { printer in + printer.printAsPattern(convertedFromAST: node, isTopLevel: true) } } @@ -89,7 +82,7 @@ extension PrettyPrinter { // to have a non-backing-off pretty-printer that this // can defer to. private mutating func printAsPattern( - convertedFromAST node: DSLTree.Node + convertedFromAST node: DSLTree.Node, isTopLevel: Bool = false ) { if patternBackoff(DSLTree._Tree(node)) { printBackoff(node) @@ -106,11 +99,7 @@ extension PrettyPrinter { } case let .concatenation(c): - printBlock("Regex") { printer in - c.forEach { - printer.printAsPattern(convertedFromAST: $0) - } - } + printConcatenationAsPattern(c, isTopLevel: isTopLevel) case let .nonCapturingGroup(kind, child): switch kind.ast { @@ -263,7 +252,7 @@ extension PrettyPrinter { // check above, so it should work out. Need a // cleaner way to do this. This means the argument // label is a lie. 
- printAsPattern(convertedFromAST: n) + printAsPattern(convertedFromAST: n, isTopLevel: isTopLevel) case let .customCharacterClass(ccc): printAsPattern(ccc) @@ -279,6 +268,60 @@ extension PrettyPrinter { print("/* TODO: absent function */") } } + + enum NodeToPrint { + case dslNode(DSLTree.Node) + case stringLiteral(String) + } + + mutating func printAsPattern(_ node: NodeToPrint) { + switch node { + case .dslNode(let n): + printAsPattern(convertedFromAST: n) + case .stringLiteral(let str): + print(str) + } + } + + mutating func printConcatenationAsPattern( + _ nodes: [DSLTree.Node], isTopLevel: Bool + ) { + // We need to coalesce any adjacent character and scalar elements into a + // string literal, preserving scalar syntax. + let nodes = nodes + .map { NodeToPrint.dslNode($0.lookingThroughConvertedLiteral) } + .coalescing( + with: StringLiteralBuilder(), into: { .stringLiteral($0.result) } + ) { literal, node in + guard case .dslNode(let node) = node else { return false } + switch node { + case let .atom(.char(c)): + literal.append(c) + return true + case let .atom(.scalar(s)): + literal.append(unescaped: s._dslBase) + return true + case .quotedLiteral(let q): + literal.append(q) + return true + default: + return false + } + } + if isTopLevel || nodes.count == 1 { + // If we're at the top level, or we coalesced everything into a single + // element, we don't need to print a surrounding Regex { ... }. 
+ for n in nodes { + printAsPattern(n) + } + return + } + printBlock("Regex") { printer in + for n in nodes { + printer.printAsPattern(n) + } + } + } mutating func printAsPattern( _ ccc: DSLTree.CustomCharacterClass, @@ -341,8 +384,7 @@ extension PrettyPrinter { charMembers.append(c) return false case let .scalar(s): - charMembers.append( - unescaped: "\\u{\(String(s.value, radix: 16, uppercase: true))}") + charMembers.append(unescaped: s._dslBase) return false case .unconverted(_): return true @@ -449,9 +491,9 @@ extension PrettyPrinter { case let .scalar(s): if wrap { - output("One(.anyOf(\"\\u{\(String(s.value, radix: 16, uppercase: true))}\"))") + output("One(.anyOf(\(s._dslBase._bareQuoted)))") } else { - output(".anyOf(\"\\u{\(String(s.value, radix: 16, uppercase: true))}\")") + output(".anyOf(\(s._dslBase._bareQuoted))") } case let .unconverted(a): @@ -625,6 +667,10 @@ extension String { } } +extension UnicodeScalar { + var _dslBase: String { "\\u{\(String(value, radix: 16, uppercase: true))}" } +} + /// A helper for building string literals, which handles escaping the contents /// appended. 
fileprivate struct StringLiteralBuilder { @@ -851,19 +897,15 @@ extension AST.Atom { } var _dslBase: (String, canBeWrapped: Bool) { - func scalarLiteral(_ s: UnicodeScalar) -> String { - let hex = String(s.value, radix: 16, uppercase: true) - return "\\u{\(hex)}" - } switch kind { case let .char(c): return (String(c), false) case let .scalar(s): - return (scalarLiteral(s.value), false) + return (s.value._dslBase, false) case let .scalarSequence(seq): - return (seq.scalarValues.map(scalarLiteral).joined(), false) + return (seq.scalarValues.map(\._dslBase).joined(), false) case let .property(p): return (p._dslBase, true) diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index c4ac8e759..4eb7bc42c 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -43,61 +43,7 @@ extension AST.Node { return .orderedChoice(children) case let .concatenation(v): - // Coalesce adjacent children who can produce a - // string literal representation - let astChildren = v.children - func coalesce( - _ idx: Array.Index - ) -> (Array.Index, String)? { - var result = "" - var idx = idx - while idx < astChildren.endIndex { - guard let atom: AST.Atom = astChildren[idx].as() else { break } - - // TODO: For printing, nice to coalesce - // scalars literals too. We likely need a different - // approach even before we have a better IR. - if let char = atom.singleCharacter { - result.append(char) - } else if let scalar = atom.singleScalar { - result.append(Character(scalar)) - } else if case .scalarSequence(let seq) = atom.kind { - result += seq.scalarValues.map(Character.init) - } else { - break - } - - astChildren.formIndex(after: &idx) - } - return result.isEmpty ? 
nil : (idx, result) - } - - // No need to nest single children concatenations - if astChildren.count == 1 { - return astChildren.first!.dslTreeNode - } - - // Check for a single child post-coalescing - if let (idx, str) = coalesce(astChildren.startIndex), - idx == astChildren.endIndex - { - return .quotedLiteral(str) - } - - // Coalesce adjacent string children - var curIdx = astChildren.startIndex - var children = Array() - while curIdx < astChildren.endIndex { - if let (nextIdx, str) = coalesce(curIdx) { - // TODO: Track source info... - children.append(.quotedLiteral(str)) - curIdx = nextIdx - } else { - children.append(astChildren[curIdx].dslTreeNode) - astChildren.formIndex(after: &curIdx) - } - } - return .concatenation(children) + return .concatenation(v.children.map(\.dslTreeNode)) case let .group(v): let child = v.child.dslTreeNode @@ -135,10 +81,9 @@ extension AST.Node { case let .atom(v): switch v.kind { case .scalarSequence(let seq): - // Scalar sequences are splatted into concatenated scalars, which - // becomes a quoted literal. Sequences nested in concatenations have - // already been coalesced, this just handles the lone atom case. - return .quotedLiteral(String(seq.scalarValues.map(Character.init))) + // The DSL doesn't have an equivalent node for scalar sequences. Splat + // them into a concatenation of scalars. + return .concatenation(seq.scalarValues.map { .atom(.scalar($0)) }) default: return .atom(v.dslTreeAtom) } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 4ea905fd5..520f4991a 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -334,6 +334,14 @@ extension DSLTree.Node { default: return nil } } + + /// If this node is for a converted literal, look through it. 
+ var lookingThroughConvertedLiteral: Self { + switch self { + case let .convertedRegexLiteral(n, _): return n + default: return self + } + } } extension DSLTree.Atom { diff --git a/Sources/_StringProcessing/Utility/Misc.swift b/Sources/_StringProcessing/Utility/Misc.swift new file mode 100644 index 000000000..139a1be34 --- /dev/null +++ b/Sources/_StringProcessing/Utility/Misc.swift @@ -0,0 +1,47 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +extension Array { + /// Coalesce adjacent elements using a given accumulator. The accumulator is + /// transformed into an element of the array by `finish`. The `accumulate` + /// function should return `true` if the accumulator has coalesced the + /// element, `false` otherwise. + func coalescing( + with initialAccumulator: T, into finish: (T) -> Element, + accumulate: (inout T, Element) -> Bool + ) -> Self { + var didAccumulate = false + var accumulator = initialAccumulator + + var result = Self() + for elt in self { + if accumulate(&accumulator, elt) { + // The element has been coalesced into accumulator, there is nothing + // else to do. + didAccumulate = true + continue + } + if didAccumulate { + // We have a leftover accumulator, which needs to be finished before we + // can append the next element. + result.append(finish(accumulator)) + accumulator = initialAccumulator + didAccumulate = false + } + result.append(elt) + } + // Handle a leftover accumulation. 
+ if didAccumulate { + result.append(finish(accumulator)) + } + return result + } +} diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 1d186e0bc..e25f2df05 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -1435,7 +1435,8 @@ class RegexDSLTests: XCTestCase { "\u{200D}" as UnicodeScalar "👦" as UnicodeScalar } - XCTAssertNil(try r3.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r3.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r3.wholeMatch(in: "👨‍👨‍👧‍👦")) XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦")) XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦")) @@ -1447,18 +1448,72 @@ class RegexDSLTests: XCTestCase { try r4.firstMatch(in: "é") ) - try XCTExpectFailure("Need stronger scalar coalescing logic") { - let r5 = Regex { - "e" - "\u{301}" as UnicodeScalar + let r5 = Regex { + "e" + "\u{301}" as UnicodeScalar + } + XCTAssertNotNil(try r5.firstMatch(in: "e\u{301}")) + XCTAssertNotNil(try r5.firstMatch(in: "é")) + + let r6 = Regex { + "abcde" + "\u{301}" + } + XCTAssertNotNil(try r6.firstMatch(in: "abcde\u{301}")) + XCTAssertNotNil(try r6.firstMatch(in: "abcdé")) + + let r7 = Regex { + "e" as Character + "\u{301}" as Character + } + XCTAssertNotNil(try r7.firstMatch(in: "e\u{301}")) + XCTAssertNotNil(try r7.firstMatch(in: "é")) + + // You can't match a partial grapheme in grapheme semantic mode. + let r8 = Regex { + "👨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👧" as UnicodeScalar + } + XCTAssertNil(try r8.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNil(try r8.wholeMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r8.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNil(try r8.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦")) + + // Scalar coalescing occurs across nested concatenations and literals. 
+ let r9 = Regex { + Regex { + try! Regex(#"👨"#) + "\u{200D}" as UnicodeScalar + Regex { + "👨" as UnicodeScalar + } } - XCTAssertNotNil( - try r5.firstMatch(in: "e\u{301}") - ) - XCTAssertNotNil( - try r5.firstMatch(in: "é") - ) + Regex { + Regex { + "\u{200D}" as UnicodeScalar + "👧" + } + try! Regex(#"\u{200D}👦"#) + } + } + XCTAssertNotNil(try r9.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r9.wholeMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r9.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r9.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦")) + + let r10 = Regex { + "👨" as UnicodeScalar + try! Regex(#"\u{200D 1F468 200D 1F467}"#) + "\u{200D}" as UnicodeScalar + "👦" as UnicodeScalar } + XCTAssertNotNil(try r10.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r10.wholeMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r10.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r10.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦")) } struct SemanticVersion: Equatable { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 0a0abfc92..7d65cd132 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -312,6 +312,38 @@ extension RegexTests { match: "\u{006f}\u{031b}\u{0323}" ) + // e + combining accents + firstMatchTest( + #"e\u{301 302 303}"#, + input: "e\u{301}\u{302}\u{303}", + match: "e\u{301}\u{302}\u{303}" + ) + firstMatchTest( + #"e\u{315 35C 301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + firstMatchTest( + #"e\u{301}\u{302 303}"#, + input: "e\u{301}\u{302}\u{303}", + match: "e\u{301}\u{302}\u{303}" + ) + firstMatchTest( + #"e\u{35C}\u{315 301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + firstMatchTest( + #"e\u{35C}\u{315 301}"#, + input: "e\u{315}\u{301}\u{35C}", + match: "e\u{315}\u{301}\u{35C}" + ) + firstMatchTest( + #"e\u{301}\de\u{302}"#, + 
input: "e\u{301}0e\u{302}", + match: "e\u{301}0e\u{302}" + ) + // Escape sequences that represent scalar values. firstMatchTest(#"\a[\b]\e\f\n\r\t"#, input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t", @@ -1861,6 +1893,11 @@ extension RegexTests { #"e$"#, (eComposed, false), (eDecomposed, false)) + + matchTest( + #"\u{65 301}"#, + (eComposed, true), + (eDecomposed, true)) } func testCanonicalEquivalenceCharacterClass() throws { diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 3b0a8d5b3..65a9422f2 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -171,10 +171,59 @@ extension RenderDSLTests { } """#) - // TODO: We ought to try and preserve the scalar syntax here. try testConversion(#"a\u{301}"#, #""" Regex { - "á" + "a\u{301}" + } + """#) + + try testConversion(#"👨\u{200D}👨\u{200D}👧\u{200D}👦"#, #""" + Regex { + "👨\u{200D}👨\u{200D}👧\u{200D}👦" + } + """#) + + try testConversion(#"(👨\u{200D}👨)\u{200D}👧\u{200D}👦"#, #""" + Regex { + Capture { + "👨\u{200D}👨" + } + "\u{200D}👧\u{200D}👦" + } + """#) + + // We preserve the structure of non-capturing groups. + try testConversion(#"abcd(?:e\u{301}\d)"#, #""" + Regex { + "abcd" + Regex { + "e\u{301}" + One(.digit) + } + } + """#) + + try testConversion(#"\u{A B C}"#, #""" + Regex { + "\u{A}\u{B}\u{C}" + } + """#) + + // TODO: We might want to consider preserving scalar sequences in the DSL, + // and allowing them to merge with other concatenations. + try testConversion(#"\u{A B C}\u{d}efg"#, #""" + Regex { + "\u{A}\u{B}\u{C}" + "\u{D}efg" + } + """#) + + // FIXME: We don't actually have a way of specifying in the DSL that we + // shouldn't join these together, should we print them as regex instead? 
+ try testConversion(#"a(?:\u{301})"#, #""" + Regex { + "a" + "\u{301}" } """#) } From b61c7708d2115767e85e799630272822c0a58da4 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 20 Jul 2022 21:22:04 +0100 Subject: [PATCH 24/28] Fix scalar mode for quoted sequences in character class Previously we would only match entire characters. Update to use the generic Character consumer logic that can handle scalar semantic mode. rdar://97209131 --- .../_StringProcessing/ConsumerInterface.swift | 24 ++++------ Tests/RegexTests/MatchTests.swift | 46 +++++++++++++++++++ 2 files changed, 56 insertions(+), 14 deletions(-) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index c96775500..cb9c79fa6 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -63,7 +63,7 @@ extension DSLTree._AST.Atom { extension Character { func generateConsumer( _ opts: MatchingOptions - ) throws -> MEProgram.ConsumeFunction? { + ) throws -> MEProgram.ConsumeFunction { let isCaseInsensitive = opts.isCaseInsensitive switch opts.semanticLevel { case .graphemeCluster: @@ -456,21 +456,17 @@ extension DSLTree.CustomCharacterClass.Member { } return rhs(input, bounds) } - case .quotedLiteral(let s): - if opts.isCaseInsensitive { - return { input, bounds in - guard s.lowercased()._contains(input[bounds.lowerBound].lowercased()) else { - return nil - } - return input.index(after: bounds.lowerBound) - } - } else { - return { input, bounds in - guard s.contains(input[bounds.lowerBound]) else { - return nil + case .quotedLiteral(let str): + let consumers = try str.map { + try $0.generateConsumer(opts) + } + return { input, bounds in + for fn in consumers { + if let idx = fn(input, bounds) { + return idx } - return input.index(after: bounds.lowerBound) } + return nil } case .trivia: // TODO: Should probably strip this earlier... 
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 7d65cd132..373026d5f 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -191,6 +191,7 @@ func firstMatchTests( enableTracing: Bool = false, dumpAST: Bool = false, xfail: Bool = false, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #filePath, line: UInt = #line ) { @@ -203,6 +204,7 @@ func firstMatchTests( enableTracing: enableTracing, dumpAST: dumpAST, xfail: xfail, + semanticLevel: semanticLevel, file: file, line: line) } @@ -728,6 +730,50 @@ extension RegexTests { ("a\u{301}", true), semanticLevel: .unicodeScalar) + // Scalar matching in quoted sequences. + firstMatchTests( + "[\\Qe\u{301}\\E]", + ("e", nil), + ("E", nil), + ("\u{301}", nil), + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("E\u{301}", nil), + ("\u{C9}", nil) + ) + firstMatchTests( + "[\\Qe\u{301}\\E]", + ("e", "e"), + ("E", nil), + ("\u{301}", "\u{301}"), + (eDecomposed, "e"), + (eComposed, nil), + ("E\u{301}", "\u{301}"), + ("\u{C9}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + "(?i)[\\Qe\u{301}\\E]", + ("e", nil), + ("E", nil), + ("\u{301}", nil), + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("E\u{301}", "E\u{301}"), + ("\u{C9}", "\u{C9}") + ) + firstMatchTests( + "(?i)[\\Qe\u{301}\\E]", + ("e", "e"), + ("E", "E"), + ("\u{301}", "\u{301}"), + (eDecomposed, "e"), + (eComposed, nil), + ("E\u{301}", "E"), + ("\u{C9}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTest("[-]", input: "123-abcxyz", match: "-") // These are metacharacters in certain contexts, but normal characters From bda6fbcb701418613c7b2699dbc3bcfc6e07eb02 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 20 Jul 2022 21:22:05 +0100 Subject: [PATCH 25/28] Form ASCII bitsets for quoted sequences in character classes --- .../_StringProcessing/ConsumerInterface.swift | 25 ++++++++++--------- 
Tests/RegexTests/CompileTests.swift | 9 +++++++ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index cb9c79fa6..0c89faae0 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -327,24 +327,25 @@ extension DSLTree.CustomCharacterClass.Member { _ opts: MatchingOptions, _ isInverted: Bool ) -> DSLTree.CustomCharacterClass.AsciiBitset? { + typealias Bitset = DSLTree.CustomCharacterClass.AsciiBitset switch self { case let .atom(a): if let val = a.singleScalarASCIIValue { - return DSLTree.CustomCharacterClass.AsciiBitset( - val, - isInverted, - opts.isCaseInsensitive - ) + return Bitset(val, isInverted, opts.isCaseInsensitive) } case let .range(low, high): - if let lowVal = low.singleScalarASCIIValue, let highVal = high.singleScalarASCIIValue { - return DSLTree.CustomCharacterClass.AsciiBitset( - low: lowVal, - high: highVal, - isInverted: isInverted, - isCaseInsensitive: opts.isCaseInsensitive - ) + if let lowVal = low.singleScalarASCIIValue, + let highVal = high.singleScalarASCIIValue { + return Bitset(low: lowVal, high: highVal, isInverted: isInverted, + isCaseInsensitive: opts.isCaseInsensitive) + } + case .quotedLiteral(let str): + var bitset = Bitset(isInverted: isInverted) + for c in str { + guard let ascii = c._singleScalarAsciiValue else { return nil } + bitset = bitset.union(Bitset(ascii, isInverted, opts.isCaseInsensitive)) } + return bitset default: return nil } diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 6c8f66e10..90694fc19 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -317,6 +317,15 @@ extension RegexTests { semanticLevel: .unicodeScalar, contains: [.matchBitsetScalar], doesNotContain: [.matchBitset, .consumeBy]) + expectProgram( + for: #"[\Qab\Ec]"#, + contains: 
[.matchBitset], + doesNotContain: [.consumeBy, .matchBitsetScalar]) + expectProgram( + for: #"[\Qab\Ec]"#, + semanticLevel: .unicodeScalar, + contains: [.matchBitsetScalar], + doesNotContain: [.matchBitset, .consumeBy]) } func testScalarOptimizeCompilation() { From e9838da3287a3a37b6c0a1a9cb15d8738c8f4180 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 20 Jul 2022 21:22:05 +0100 Subject: [PATCH 26/28] Coalesce character class members In grapheme semantic mode, coalesce adjacent character and scalar members of a custom character class, over which we can perform grapheme breaking. This involves potentially re-writing ranges such that they contain a complete grapheme of adjacent scalars. --- Sources/_StringProcessing/ByteCodeGen.swift | 118 +++++++++ Sources/_StringProcessing/Utility/Misc.swift | 20 +- Tests/RegexTests/MatchTests.swift | 247 +++++++++++++++++++ Tests/RegexTests/ParseTests.swift | 2 + 4 files changed, 383 insertions(+), 4 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index da21ea26a..446d62d30 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -775,9 +775,127 @@ fileprivate extension Compiler.ByteCodeGen { builder.label(exit) } + /// Coalesce any adjacent scalar members in a custom character class together. + /// This is required in order to produce correct grapheme matching behavior. + func coalescingCustomCharacterClassMembers( + _ members: [DSLTree.CustomCharacterClass.Member] + ) -> [DSLTree.CustomCharacterClass.Member] { + struct Accumulator { + /// A series of range operands. For example, in `[ab-cde-fg]`, this will + /// contain the strings `["ab", "cde", "fg"]`. From there, the resulting + /// ranges will be created. + private var rangeOperands: [String] = [""] + + /// The current range operand. 
+ private var current: String { + _read { yield rangeOperands[rangeOperands.count - 1] } + _modify { yield &rangeOperands[rangeOperands.count - 1] } + } + + /// Try to accumulate a character class member, returning `true` if + /// successful, `false` otherwise. + mutating func tryAccumulate( + _ member: DSLTree.CustomCharacterClass.Member + ) -> Bool { + switch member { + case .atom(let a): + guard let c = a.literalCharacterValue else { return false } + current.append(c) + return true + case .quotedLiteral(let str): + current += str + return true + case let .range(lhs, rhs): + guard let lhs = lhs.literalCharacterValue, + let rhs = rhs.literalCharacterValue + else { return false } + current.append(lhs) + rangeOperands.append(String(rhs)) + return true + default: + return false + } + } + + func finish() -> [DSLTree.CustomCharacterClass.Member] { + if rangeOperands.count == 1 { + // If we didn't have any additional range operands, this isn't a + // range, we can just form a standard quoted literal. + return [.quotedLiteral(current)] + } + var members = [DSLTree.CustomCharacterClass.Member]() + + // We have other range operands, splice them together. For N operands + // we have N - 1 ranges. + for (i, lhs) in rangeOperands.dropLast().enumerated() { + let rhs = rangeOperands[i + 1] + + // If this is the first operand we only need to drop the last + // character for its quoted members, otherwise this is both an LHS + // and RHS of a range, and as such needs both sides trimmed. + let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast() + if !leading.isEmpty { + members.append(.quotedLiteral(String(leading))) + } + members.append(.range(.char(lhs.last!), .char(rhs.first!))) + } + // We've handled everything except the quoted portion of the last + // operand, add it now. 
+ let trailing = rangeOperands.last!.dropFirst() + if !trailing.isEmpty { + members.append(.quotedLiteral(String(trailing))) + } + return members + } + } + return members + .map { m -> DSLTree.CustomCharacterClass.Member in + // First we need to recursively coalesce any child character classes. + switch m { + case .custom(let ccc): + return .custom(coalescingCustomCharacterClass(ccc)) + case .intersection(let lhs, let rhs): + return .intersection( + coalescingCustomCharacterClass(lhs), + coalescingCustomCharacterClass(rhs)) + case .subtraction(let lhs, let rhs): + return .subtraction( + coalescingCustomCharacterClass(lhs), + coalescingCustomCharacterClass(rhs)) + case .symmetricDifference(let lhs, let rhs): + return .symmetricDifference( + coalescingCustomCharacterClass(lhs), + coalescingCustomCharacterClass(rhs)) + case .atom, .range, .quotedLiteral, .trivia: + return m + } + } + .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in + accum.tryAccumulate(member) + } + } + + func coalescingCustomCharacterClass( + _ ccc: DSLTree.CustomCharacterClass + ) -> DSLTree.CustomCharacterClass { + // This only needs to be done in grapheme semantic mode. In scalar semantic + // mode, we don't want to coalesce any scalars into a grapheme. This + // means that e.g `[e\u{301}-\u{302}]` remains a range between U+301 and + // U+302. + guard options.semanticLevel == .graphemeCluster else { return ccc } + + let members = coalescingCustomCharacterClassMembers(ccc.members) + return .init(members: members, isInverted: ccc.isInverted) + } + mutating func emitCustomCharacterClass( _ ccc: DSLTree.CustomCharacterClass ) throws { + // Before emitting a custom character class in grapheme semantic mode, we + // need to coalesce together any adjacent characters and scalars, over which + // we can perform grapheme breaking. This includes e.g range bounds for + // `[e\u{301}-\u{302}]`.
+ let ccc = coalescingCustomCharacterClass(ccc) if let asciiBitset = ccc.asAsciiBitset(options), optimizationsEnabled { if options.semanticLevel == .unicodeScalar { diff --git a/Sources/_StringProcessing/Utility/Misc.swift b/Sources/_StringProcessing/Utility/Misc.swift index 139a1be34..8a9cbe325 100644 --- a/Sources/_StringProcessing/Utility/Misc.swift +++ b/Sources/_StringProcessing/Utility/Misc.swift @@ -11,11 +11,11 @@ extension Array { /// Coalesce adjacent elements using a given accumulator. The accumulator is - /// transformed into an element of the array by `finish`. The `accumulate` + /// transformed into elements of the array by `finish`. The `accumulate` /// function should return `true` if the accumulator has coalesced the /// element, `false` otherwise. func coalescing( - with initialAccumulator: T, into finish: (T) -> Element, + with initialAccumulator: T, into finish: (T) -> Self, accumulate: (inout T, Element) -> Bool ) -> Self { var didAccumulate = false @@ -32,7 +32,7 @@ extension Array { if didAccumulate { // We have a leftover accumulator, which needs to be finished before we // can append the next element. - result.append(finish(accumulator)) + result += finish(accumulator) accumulator = initialAccumulator didAccumulate = false } @@ -40,8 +40,20 @@ extension Array { } // Handle a leftover accumulation. if didAccumulate { - result.append(finish(accumulator)) + result += finish(accumulator) } return result } + + /// Coalesce adjacent elements using a given accumulator. The accumulator is + /// transformed into an element of the array by `finish`. The `accumulate` + /// function should return `true` if the accumulator has coalesced the + /// element, `false` otherwise. 
+ func coalescing( + with initialAccumulator: T, into finish: (T) -> Element, + accumulate: (inout T, Element) -> Bool + ) -> Self { + coalescing( + with: initialAccumulator, into: { [finish($0) ]}, accumulate: accumulate) + } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 373026d5f..458574197 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -774,6 +774,253 @@ extension RegexTests { semanticLevel: .unicodeScalar ) + // Scalar coalescing. + firstMatchTests( + #"[e\u{301}]"#, + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("e", nil), + ("\u{301}", nil) + ) + firstMatchTests( + #"[e\u{301}]"#, + (eDecomposed, "e"), + (eComposed, nil), + ("e", "e"), + ("\u{301}", "\u{301}"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[[[e\u{301}]]]"#, + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("e", nil), + ("\u{301}", nil) + ) + firstMatchTests( + #"[[[e\u{301}]]]"#, + (eDecomposed, "e"), + (eComposed, nil), + ("e", "e"), + ("\u{301}", "\u{301}"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[👨\u{200D}👩\u{200D}👧\u{200D}👦]"#, + ("👨", nil), + ("👩", nil), + ("👧", nil), + ("👦", nil), + ("\u{200D}", nil), + ("👨‍👩‍👧‍👦", "👨‍👩‍👧‍👦") + ) + firstMatchTests( + #"[👨\u{200D}👩\u{200D}👧\u{200D}👦]"#, + ("👨", "👨"), + ("👩", "👩"), + ("👧", "👧"), + ("👦", "👦"), + ("\u{200D}", "\u{200D}"), + ("👨‍👩‍👧‍👦", "👨"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[e\u{315}\u{301}\u{35C}]"#, + ("e", nil), + ("e\u{315}", nil), + ("e\u{301}", nil), + ("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"), + ("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"), + ("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}") + ) + + firstMatchTests( + #"[a-z1\u{E9}-\u{302}\u{E1}3-59]"#, + ("a", "a"), + ("a\u{301}", "a\u{301}"), + ("\u{E1}", "\u{E1}"), + ("\u{E2}", nil), + ("z", "z"), + ("e", "e"), + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("\u{302}", "\u{302}"), + 
("1", "1"), + ("2", nil), + ("3", "3"), + ("4", "4"), + ("5", "5"), + ("6", nil), + ("7", nil), + ("8", nil), + ("9", "9") + ) + firstMatchTests( + #"[ab-df-hik-lm]"#, + ("a", "a"), + ("b", "b"), + ("c", "c"), + ("d", "d"), + ("e", nil), + ("f", "f"), + ("g", "g"), + ("h", "h"), + ("i", "i"), + ("j", nil), + ("k", "k"), + ("l", "l"), + ("m", "m") + ) + firstMatchTests( + #"[a-ce-fh-j]"#, + ("a", "a"), + ("b", "b"), + ("c", "c"), + ("d", nil), + ("e", "e"), + ("f", "f"), + ("g", nil), + ("h", "h"), + ("i", "i"), + ("j", "j") + ) + + + // These can't compile in grapheme semantic mode, but make sure they work in + // scalar semantic mode. + firstMatchTests( + #"[a\u{315}\u{301}-\u{302}]"#, + ("a", "a"), + ("\u{315}", "\u{315}"), + ("\u{301}", "\u{301}"), + ("\u{302}", "\u{302}"), + ("\u{303}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[\u{73}\u{323}\u{307}-\u{1E00}]"#, + ("\u{73}", "\u{73}"), + ("\u{323}", "\u{323}"), + ("\u{307}", "\u{307}"), + ("\u{400}", "\u{400}"), + ("\u{500}", "\u{500}"), + ("\u{1E00}", "\u{1E00}"), + ("\u{1E01}", nil), + ("\u{1E69}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[a\u{302}-✅]"#, + ("a", "a"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "\u{302}"), + ("E\u{301}", nil), + ("a\u{301}", "a"), + ("\u{E1}", nil), + ("a\u{302}", "a"), + ("\u{E2}", nil), + ("\u{E3}", nil), + ("\u{EF}", nil), + ("e\u{301}", nil), + ("e\u{302}", "\u{302}"), + ("\u{2705}", "\u{2705}"), + ("✅", "✅"), + ("\u{376}", "\u{376}"), + ("\u{850}", "\u{850}"), + ("a\u{302}\u{315}", "a"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"(?i)[a\u{302}-✅]"#, + ("a", "a"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "A"), + ("E\u{301}", nil), + ("a\u{301}", "a"), + ("\u{E1}", nil), + ("a\u{302}", "a"), + ("\u{E2}", nil), + ("\u{E3}", nil), + ("\u{EF}", nil), + ("e\u{301}", nil), + ("e\u{302}", "\u{302}"), + ("\u{2705}", "\u{2705}"), + ("✅", "✅"), + ("\u{376}", "\u{376}"), + ("\u{850}", "\u{850}"), + ("a\u{302}\u{315}", "a"), 
+ semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[e\u{301}-\u{302}]"#, + ("a", nil), + ("e", "e"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "\u{302}"), + ("E\u{301}", "\u{301}"), + ("\u{C8}", nil), + ("\u{C9}", nil), + ("\u{CA}", nil), + ("\u{CB}", nil), + ("a\u{301}", "\u{301}"), + ("a\u{302}", "\u{302}"), + ("e\u{301}", "e"), + ("e\u{302}", "e"), + ("\u{E1}", nil), + ("\u{E2}", nil), + ("\u{E9}", nil), + ("\u{EA}", nil), + ("\u{EF}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"(?i)[e\u{301}-\u{302}]"#, + ("a", nil), + ("e", "e"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "\u{302}"), + ("E\u{301}", "E"), + ("\u{C8}", nil), + ("\u{C9}", nil), + ("\u{CA}", nil), + ("\u{CB}", nil), + ("a\u{301}", "\u{301}"), + ("a\u{302}", "\u{302}"), + ("e\u{301}", "e"), + ("e\u{302}", "e"), + ("\u{E1}", nil), + ("\u{E2}", nil), + ("\u{E9}", nil), + ("\u{EA}", nil), + ("\u{EF}", nil), + semanticLevel: .unicodeScalar + ) + + // Set operation scalar coalescing. + firstMatchTests( + #"[e\u{301}&&e\u{301}e\u{302}]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("e\u{301}", "e\u{301}"), + ("e\u{302}", nil)) + firstMatchTests( + #"[e\u{301}~~[[e\u{301}]e\u{302}]]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("e\u{301}", nil), + ("e\u{302}", "e\u{302}")) + firstMatchTests( + #"[e\u{301}[e\u{303}]--[[e\u{301}]e\u{302}]]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("\u{303}", nil), + ("e\u{301}", nil), + ("e\u{302}", nil), + ("e\u{303}", "e\u{303}")) + firstMatchTest("[-]", input: "123-abcxyz", match: "-") // These are metacharacters in certain contexts, but normal characters diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index f5e93c2bd..84ce361f3 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2929,6 +2929,8 @@ extension RegexTests { diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b")) diagnosticTest(#"[\u{66}-\u{65}]"#, 
.invalidCharacterRange(from: "\u{66}", to: "\u{65}")) + diagnosticTest(#"[e\u{301}-e\u{302}]"#, .invalidCharacterRange(from: "\u{301}", to: "e")) + diagnosticTest("(?x)[(?#)]", .expected("]")) diagnosticTest("(?x)[(?#abc)]", .expected("]")) From d5cad1c7015e934f1f327e3d1fe080bd003955a0 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 20 Jul 2022 21:22:06 +0100 Subject: [PATCH 27/28] Throw `RegexCompilationError` for invalid character class bounds Make sure we throw the right error for ranges that are invalid in grapheme mode, but are valid in scalar mode. --- Sources/_StringProcessing/Compiler.swift | 30 ++++++++++++-- .../_StringProcessing/ConsumerInterface.swift | 12 +++++- Tests/RegexTests/CompileTests.swift | 40 +++++++++++++++++++ 3 files changed, 77 insertions(+), 5 deletions(-) diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 530126a32..b8daa8b21 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -42,19 +42,43 @@ class Compiler { } } +/// Hashable wrapper for `Any.Type`. +struct AnyHashableType: CustomStringConvertible, Hashable { + var ty: Any.Type + init(_ ty: Any.Type) { + self.ty = ty + } + var description: String { "\(ty)" } + + static func == (lhs: Self, rhs: Self) -> Bool { + lhs.ty == rhs.ty + } + func hash(into hasher: inout Hasher) { + hasher.combine(ObjectIdentifier(ty)) + } +} + // An error produced when compiling a regular expression. -enum RegexCompilationError: Error, CustomStringConvertible { +enum RegexCompilationError: Error, Hashable, CustomStringConvertible { // TODO: Source location? 
case uncapturedReference + case incorrectOutputType(incorrect: AnyHashableType, correct: AnyHashableType) + case invalidCharacterClassRangeOperand(Character) + + static func incorrectOutputType( + incorrect: Any.Type, correct: Any.Type + ) -> Self { + .incorrectOutputType(incorrect: .init(incorrect), correct: .init(correct)) + } - case incorrectOutputType(incorrect: Any.Type, correct: Any.Type) - var description: String { switch self { case .uncapturedReference: return "Found a reference used before it captured any match." case .incorrectOutputType(let incorrect, let correct): return "Cast to incorrect type 'Regex<\(incorrect)>', expected 'Regex<\(correct)>'" + case .invalidCharacterClassRangeOperand(let c): + return "'\(c)' is an invalid bound for character class range" } } } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 0c89faae0..083781120 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -362,12 +362,20 @@ extension DSLTree.CustomCharacterClass.Member { } return c case let .range(low, high): - guard let lhs = low.literalCharacterValue?.singleScalar, lhs.isNFC else { + guard let lhsChar = low.literalCharacterValue else { throw Unsupported("\(low) in range") } - guard let rhs = high.literalCharacterValue?.singleScalar, rhs.isNFC else { + guard let rhsChar = high.literalCharacterValue else { throw Unsupported("\(high) in range") } + + // We must have NFC single scalar bounds. 
+ guard let lhs = lhsChar.singleScalar, lhs.isNFC else { + throw RegexCompilationError.invalidCharacterClassRangeOperand(lhsChar) + } + guard let rhs = rhsChar.singleScalar, rhs.isNFC else { + throw RegexCompilationError.invalidCharacterClassRangeOperand(rhsChar) + } guard lhs <= rhs else { throw Unsupported("Invalid range \(low)-\(high)") } diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 90694fc19..27f8d79cb 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -11,6 +11,7 @@ @testable import _RegexParser @testable import _StringProcessing +import TestSupport import XCTest @@ -168,6 +169,45 @@ extension RegexTests { } } + private func testCompileError( + _ regex: String, _ error: RegexCompilationError, + file: StaticString = #file, line: UInt = #line + ) { + do { + _ = try _compileRegex(regex) + XCTFail("Expected compile error", file: file, line: line) + } catch let err as RegexCompilationError { + XCTAssertEqual(err, error, file: file, line: line) + } catch { + XCTFail("Unknown compile error", file: file, line: line) + } + } + + func testInvalidScalarCoalescing() throws { + guard ensureNewStdlib() else { return } + + // Non-single-scalar bounds. 
+ testCompileError( + #"[a\u{302}-✅]"#, .invalidCharacterClassRangeOperand("a\u{302}")) + testCompileError( + #"[e\u{301}-\u{302}]"#, .invalidCharacterClassRangeOperand("e\u{301}")) + testCompileError( + #"[\u{73}\u{323}\u{307}-\u{1E00}]"#, + .invalidCharacterClassRangeOperand("\u{73}\u{323}\u{307}")) + testCompileError( + #"[a\u{315}\u{301}-\u{302}]"#, + .invalidCharacterClassRangeOperand("a\u{315}\u{301}") + ) + testCompileError( + #"[a-z1e\u{301}-\u{302}\u{E1}3-59]"#, + .invalidCharacterClassRangeOperand("e\u{301}") + ) + testCompileError( + #"[[e\u{301}-\u{302}]&&e\u{303}]"#, + .invalidCharacterClassRangeOperand("e\u{301}") + ) + } + func testCompileQuantification() throws { // NOTE: While we might change how we compile From f2d44ff601bbed32884f9da36e9dc8bb3b24a6b3 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Wed, 20 Jul 2022 21:22:06 +0100 Subject: [PATCH 28/28] Allow coalescing through trivia I also noticed that `lexQuantifier` could silently eat trivia if it failed to lex a quantification, so also fix that. --- .../Regex/Parse/LexicalAnalysis.swift | 46 +++++++-------- Sources/_StringProcessing/ByteCodeGen.swift | 8 +++ .../_StringProcessing/PrintAsPattern.swift | 4 ++ Tests/RegexTests/MatchTests.swift | 56 +++++++++++++++++++ Tests/RegexTests/RenderDSLTests.swift | 12 ++++ 5 files changed, 104 insertions(+), 22 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 4a4f5c05f..a830a18b7 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -480,35 +480,37 @@ extension Parser { /// mutating func lexQuantifier( ) -> (Located, Located, [AST.Trivia])? { - var trivia: [AST.Trivia] = [] + tryEating { p in + var trivia: [AST.Trivia] = [] - if let t = lexNonSemanticWhitespace() { trivia.append(t) } + if let t = p.lexNonSemanticWhitespace() { trivia.append(t) } - let amt: Located? 
= recordLoc { p in - if p.tryEat("*") { return .zeroOrMore } - if p.tryEat("+") { return .oneOrMore } - if p.tryEat("?") { return .zeroOrOne } + let amt: Located? = p.recordLoc { p in + if p.tryEat("*") { return .zeroOrMore } + if p.tryEat("+") { return .oneOrMore } + if p.tryEat("?") { return .zeroOrOne } - return p.tryEating { p in - guard p.tryEat("{"), - let range = p.lexRange(trivia: &trivia), - p.tryEat("}") - else { return nil } - return range.value + return p.tryEating { p in + guard p.tryEat("{"), + let range = p.lexRange(trivia: &trivia), + p.tryEat("}") + else { return nil } + return range.value + } } - } - guard let amt = amt else { return nil } + guard let amt = amt else { return nil } - // PCRE allows non-semantic whitespace here in extended syntax mode. - if let t = lexNonSemanticWhitespace() { trivia.append(t) } + // PCRE allows non-semantic whitespace here in extended syntax mode. + if let t = p.lexNonSemanticWhitespace() { trivia.append(t) } - let kind: Located = recordLoc { p in - if p.tryEat("?") { return .reluctant } - if p.tryEat("+") { return .possessive } - return .eager - } + let kind: Located = p.recordLoc { p in + if p.tryEat("?") { return .reluctant } + if p.tryEat("+") { return .possessive } + return .eager + } - return (amt, kind, trivia) + return (amt, kind, trivia) + } } /// Try to consume a range, returning `nil` if unsuccessful. diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 446d62d30..e8c92f2b5 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -812,6 +812,10 @@ fileprivate extension Compiler.ByteCodeGen { current.append(lhs) rangeOperands.append(String(rhs)) return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. 
+ return !current.isEmpty default: return false } @@ -935,6 +939,10 @@ fileprivate extension Compiler.ByteCodeGen { case .quotedLiteral(let q): str += q return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. + return !str.isEmpty default: return false } diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 9035d2f51..c1753c49d 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -304,6 +304,10 @@ extension PrettyPrinter { case .quotedLiteral(let q): literal.append(q) return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. + return !literal.isEmpty default: return false } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 458574197..8e01582a9 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -345,6 +345,23 @@ extension RegexTests { input: "e\u{301}0e\u{302}", match: "e\u{301}0e\u{302}" ) + firstMatchTest( + #"(?x) e \u{35C} \u{315}(?#hello)\u{301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + firstMatchTest( + #"(?x) e \u{35C} \u{315 301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + + // We don't coalesce across groups. + firstMatchTests( + #"e\u{301}(?:\u{315}\u{35C})?"#, + ("e\u{301}", "e\u{301}"), + ("e\u{301}\u{315}\u{35C}", nil) + ) // Escape sequences that represent scalar values. 
firstMatchTest(#"\a[\b]\e\f\n\r\t"#, @@ -833,6 +850,30 @@ extension RegexTests { ("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"), ("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}") ) + firstMatchTests( + #"(?x) [ e \u{315} \u{301} \u{35C} ]"#, + ("e", nil), + ("e\u{315}", nil), + ("e\u{301}", nil), + ("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"), + ("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"), + ("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}") + ) + + // We don't coalesce across character classes. + firstMatchTests( + #"e[\u{315}\u{301}\u{35C}]"#, + ("e", nil), + ("e\u{315}", nil), + ("e\u{315}\u{301}", nil), + ("e\u{301}\u{315}\u{35C}", nil) + ) + firstMatchTests( + #"[e[\u{301}]]"#, + ("e", "e"), + ("\u{301}", "\u{301}"), + ("e\u{301}", nil) + ) firstMatchTests( #"[a-z1\u{E9}-\u{302}\u{E1}3-59]"#, @@ -1021,6 +1062,16 @@ extension RegexTests { ("e\u{302}", nil), ("e\u{303}", "e\u{303}")) + firstMatchTests( + #"(?x) [ e \u{301} [ e \u{303} ] -- [ [ e \u{301} ] e \u{302} ] ]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("\u{303}", nil), + ("e\u{301}", nil), + ("e\u{302}", nil), + ("e\u{303}", "e\u{303}")) + firstMatchTest("[-]", input: "123-abcxyz", match: "-") // These are metacharacters in certain contexts, but normal characters @@ -2191,6 +2242,11 @@ extension RegexTests { #"\u{65 301}"#, (eComposed, true), (eDecomposed, true)) + + matchTest( + #"(?x) \u{65} \u{301}"#, + (eComposed, true), + (eDecomposed, true)) } func testCanonicalEquivalenceCharacterClass() throws { diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 65a9422f2..e925d255c 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -177,6 +177,18 @@ extension RenderDSLTests { } """#) + try testConversion(#"(?x) a \u{301}"#, #""" + Regex { + "a\u{301}" + } + """#) + + try testConversion(#"(?x) [ a b c \u{301} ] "#, #""" + Regex { + One(.anyOf("abc\u{301}")) + } + """#) + try 
testConversion(#"👨\u{200D}👨\u{200D}👧\u{200D}👦"#, #""" Regex { "👨\u{200D}👨\u{200D}👧\u{200D}👦"