From d6a6e225a600c7c350119cd78e424515201582ca Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:37 +0100 Subject: [PATCH 01/14] Allow matching tests to specify semantic level --- Tests/RegexTests/MatchTests.swift | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index d375065ab..bb0d2b63e 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -24,9 +24,10 @@ func _firstMatch( _ regexStr: String, input: String, validateOptimizations: Bool, + semanticLevel: RegexSemanticLevel = .graphemeCluster, syntax: SyntaxOptions = .traditional ) throws -> (String, [String?]) { - var regex = try Regex(regexStr, syntax: syntax) + var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel) guard let result = try regex.firstMatch(in: input) else { throw MatchError("match not found for \(regexStr) in \(input)") } @@ -54,6 +55,7 @@ func flatCaptureTest( dumpAST: Bool = false, xfail: Bool = false, validateOptimizations: Bool = true, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #file, line: UInt = #line ) { @@ -63,6 +65,7 @@ func flatCaptureTest( regex, input: test, validateOptimizations: validateOptimizations, + semanticLevel: semanticLevel, syntax: syntax ) else { if expect == nil { @@ -113,6 +116,7 @@ func matchTest( dumpAST: Bool = false, xfail: Bool = false, validateOptimizations: Bool = true, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #file, line: UInt = #line ) { @@ -126,6 +130,7 @@ func matchTest( dumpAST: dumpAST, xfail: xfail, validateOptimizations: validateOptimizations, + semanticLevel: semanticLevel, file: file, line: line) } @@ -143,6 +148,7 @@ func firstMatchTest( dumpAST: Bool = false, xfail: Bool = false, validateOptimizations: Bool = true, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #filePath, line: UInt = #line ) { @@ -151,6 +157,7 @@ func firstMatchTest( regex, input: input, validateOptimizations: validateOptimizations, + semanticLevel: semanticLevel, syntax: syntax) if xfail { From cf3cfbe06ac21d4b8703365ce867ebf7f9cf6d5f Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:38 +0100 Subject: [PATCH 02/14] Rip out unused _CharacterClassModel API Remove the DSL -> _CharacterClassModel conversion, and _CharacterClassModel's custom character class matching logic, none of which is being used. --- .../_CharacterClassModel.swift | 192 ------------------ 1 file changed, 192 deletions(-) diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index db2088782..c1183972b 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -50,74 +50,6 @@ public struct _CharacterClassModel: Hashable { case whitespace /// Character.isLetter or Character.isDigit or Character == "_" case word - /// One of the custom character set. - case custom([CharacterSetComponent]) - } - - public enum SetOperator: Hashable { - case subtraction - case intersection - case symmetricDifference - } - - /// A binary set operation that forms a character class component. - public struct SetOperation: Hashable { - var lhs: CharacterSetComponent - var op: SetOperator - var rhs: CharacterSetComponent - - func matches(_ c: Character, with options: MatchingOptions) -> Bool { - switch op { - case .intersection: - return lhs.matches(c, with: options) && rhs.matches(c, with: options) - case .subtraction: - return lhs.matches(c, with: options) && !rhs.matches(c, with: options) - case .symmetricDifference: - return lhs.matches(c, with: options) != rhs.matches(c, with: options) - } - } - } - - public enum CharacterSetComponent: Hashable { - case character(Character) - case range(ClosedRange) - - /// A nested character class. - case characterClass(_CharacterClassModel) - - /// A binary set operation of character class components. - indirect case setOperation(SetOperation) - - public static func setOperation( - lhs: CharacterSetComponent, op: SetOperator, rhs: CharacterSetComponent - ) -> CharacterSetComponent { - .setOperation(.init(lhs: lhs, op: op, rhs: rhs)) - } - - func matches(_ character: Character, with options: MatchingOptions) -> Bool { - switch self { - case .character(let c): - if options.isCaseInsensitive { - return c.lowercased() == character.lowercased() - } else { - return c == character - } - case .range(let range): - if options.isCaseInsensitive { - let newLower = range.lowerBound.lowercased() - let newUpper = range.upperBound.lowercased() - // FIXME: Is failing this possible? Is this the right behavior if so? - guard newLower <= newUpper else { return false } - return (newLower...newUpper).contains(character.lowercased()) - } else { - return range.contains(character) - } - case .characterClass(let custom): - let str = String(character) - return custom.matches(in: str, at: str.startIndex, with: options) != nil - case .setOperation(let op): return op.matches(character, with: options) - } - } } enum MatchLevel: Hashable { @@ -188,8 +120,6 @@ public struct _CharacterClassModel: Hashable { matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = c.isWordCharacter && (c.isASCII || !options.usesASCIIWord) - case .custom(let set): - matched = set.any { $0.matches(c, with: options) } } if isInverted { matched.toggle() @@ -222,8 +152,6 @@ public struct _CharacterClassModel: Hashable { matched = c.properties.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !options.usesASCIIWord) - case .custom(let set): - matched = set.any { $0.matches(Character(c), with: options) } } if isInverted { matched.toggle() @@ -286,23 +214,6 @@ extension _CharacterClassModel { public static var word: _CharacterClassModel { .init(cc: .word, matchLevel: .graphemeCluster) } - - public static func custom( - _ components: [_CharacterClassModel.CharacterSetComponent] - ) -> _CharacterClassModel { - .init(cc: .custom(components), matchLevel: .graphemeCluster) - } -} - -extension _CharacterClassModel.CharacterSetComponent: CustomStringConvertible { - public var description: String { - switch self { - case .range(let range): return "" - case .character(let character): return "" - case .characterClass(let custom): return "\(custom)" - case .setOperation(let op): return "<\(op.lhs) \(op.op) \(op.rhs)>" - } - } } extension _CharacterClassModel.Representation: CustomStringConvertible { @@ -318,7 +229,6 @@ extension _CharacterClassModel.Representation: CustomStringConvertible { case .verticalWhitespace: return "vertical whitespace" case .whitespace: return "" case .word: return "" - case .custom(let set): return "" } } } @@ -391,22 +301,6 @@ extension _CharacterClassModel { } } -extension DSLTree.Node { - var characterClass: _CharacterClassModel? { - switch self { - case let .customCharacterClass(ccc): - return ccc.modelCharacterClass - case let .atom(a): - return a.characterClass - case .characterPredicate: - // FIXME: Do we make one from this? - return nil - default: - return nil - } - } -} - extension _CharacterClassModel { func withMatchLevel( _ level: _CharacterClassModel.MatchLevel @@ -417,17 +311,6 @@ extension _CharacterClassModel { } } -extension DSLTree.Atom { - var characterClass: _CharacterClassModel? { - switch self { - case let .unconverted(a): - return a.ast.characterClass - - default: return nil - } - } -} - extension AST.Atom { var characterClass: _CharacterClassModel? { switch kind { @@ -489,81 +372,6 @@ extension AST.Atom.EscapedBuiltin { } } -extension DSLTree.CustomCharacterClass { - // TODO: Refactor a bit, and... can we drop this type? - var modelCharacterClass: _CharacterClassModel? { - var result = - Array<_CharacterClassModel.CharacterSetComponent>() - for m in members { - switch m { - case let .atom(a): - if let cc = a.characterClass { - result.append(.characterClass(cc)) - } else if let c = a.literalCharacterValue { - result.append(.character(c)) - } else { - return nil - } - case let .range(low, high): - guard let lhs = low.literalCharacterValue, - let rhs = high.literalCharacterValue - else { - return nil - } - result.append(.range(lhs...rhs)) - - case let .custom(ccc): - guard let cc = ccc.modelCharacterClass else { - return nil - } - result.append(.characterClass(cc)) - - case let .intersection(lhs, rhs): - guard let lhs = lhs.modelCharacterClass, - let rhs = rhs.modelCharacterClass - else { - return nil - } - result.append(.setOperation( - lhs: .characterClass(lhs), - op: .intersection, - rhs: .characterClass(rhs))) - - case let .subtraction(lhs, rhs): - guard let lhs = lhs.modelCharacterClass, - let rhs = rhs.modelCharacterClass - else { - return nil - } - result.append(.setOperation( - lhs: .characterClass(lhs), - op: .subtraction, - rhs: .characterClass(rhs))) - - case let .symmetricDifference(lhs, rhs): - guard let lhs = lhs.modelCharacterClass, - let rhs = rhs.modelCharacterClass - else { - return nil - } - result.append(.setOperation( - lhs: .characterClass(lhs), - op: .symmetricDifference, - rhs: .characterClass(rhs))) - - case let .quotedLiteral(s): - // Decompose quoted literal into literal characters. - result += s.map { .character($0) } - - case .trivia: - break - } - } - let cc = _CharacterClassModel.custom(result) - return isInverted ? cc.inverted : cc - } -} - extension _CharacterClassModel { // FIXME: Calling on inverted sets wont be the same as the // inverse of a boundary if at the start or end of the From c36eb1f091614eae13de037ebe2f815256c1b057 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:38 +0100 Subject: [PATCH 03/14] Remove _CharacterClassModel conformance to RegexComponent --- Sources/_StringProcessing/_CharacterClassModel.swift | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index c1183972b..e280ba473 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -161,18 +161,6 @@ public struct _CharacterClassModel: Hashable { } } -@available(SwiftStdlib 5.7, *) -extension _CharacterClassModel: RegexComponent { - public typealias RegexOutput = Substring - - public var regex: Regex { - guard let ast = self.makeAST() else { - fatalError("FIXME: extended AST?") - } - return Regex(ast: ast) - } -} - @_spi(RegexBuilder) extension _CharacterClassModel { public static var any: _CharacterClassModel { From 3b4a34f06828936fd9f8e27e0a2e5d2008161a66 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:39 +0100 Subject: [PATCH 04/14] Internalize `_CharacterClassModel` `makeDSLTreeCharacterClass` was the last API that required it to be public. Remove it, and replace it with some static members on `_AST.Atom`. --- Sources/RegexBuilder/CharacterClass.swift | 21 ++--- Sources/_StringProcessing/Regex/DSLTree.swift | 26 +++++ .../_CharacterClassModel.swift | 94 +++---------------- 3 files changed, 50 insertions(+), 91 deletions(-) diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index a6d18b2cf..4e96e510d 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -20,11 +20,8 @@ public struct CharacterClass { self.ccc = ccc } - init(unconverted model: _CharacterClassModel) { - guard let ccc = model.makeDSLTreeCharacterClass() else { - fatalError("Unsupported character class") - } - self.ccc = ccc + init(unconverted atom: DSLTree._AST.Atom) { + self.ccc = .init(members: [.atom(.unconverted(atom))]) } } @@ -49,15 +46,15 @@ extension RegexComponent where Self == CharacterClass { } public static var anyGraphemeCluster: CharacterClass { - .init(unconverted: .anyGrapheme) + .init(unconverted: ._anyGrapheme) } public static var whitespace: CharacterClass { - .init(unconverted: .whitespace) + .init(unconverted: ._whitespace) } public static var digit: CharacterClass { - .init(unconverted: .digit) + .init(unconverted: ._digit) } public static var hexDigit: CharacterClass { @@ -69,19 +66,19 @@ extension RegexComponent where Self == CharacterClass { } public static var horizontalWhitespace: CharacterClass { - .init(unconverted: .horizontalWhitespace) + .init(unconverted: ._horizontalWhitespace) } public static var newlineSequence: CharacterClass { - .init(unconverted: .newlineSequence) + .init(unconverted: ._newlineSequence) } public static var verticalWhitespace: CharacterClass { - .init(unconverted: .verticalWhitespace) + .init(unconverted: ._verticalWhitespace) } public static var word: CharacterClass { - .init(unconverted: .word) + .init(unconverted: ._word) } } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 740bdcb8d..eb357ae87 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -820,6 +820,32 @@ extension DSLTree { @_spi(RegexBuilder) public struct Atom { internal var ast: AST.Atom + + // FIXME: The below APIs should be removed once the DSL tree has been + // migrated to use proper DSL atoms for them. + + public static var _anyGrapheme: Self { + .init(ast: .init(.escaped(.graphemeCluster), .fake)) + } + public static var _whitespace: Self { + .init(ast: .init(.escaped(.whitespace), .fake)) + } + public static var _digit: Self { + .init(ast: .init(.escaped(.decimalDigit), .fake)) + } + public static var _horizontalWhitespace: Self { + .init(ast: .init(.escaped(.horizontalWhitespace), .fake)) + } + public static var _newlineSequence: Self { + // FIXME: newline sequence is not same as \n + .init(ast: .init(.escaped(.newline), .fake)) + } + public static var _verticalWhitespace: Self { + .init(ast: .init(.escaped(.verticalTab), .fake)) + } + public static var _word: Self { + .init(ast: .init(.escaped(.wordCharacter), .fake)) + } } } } diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index e280ba473..c0de6ebaa 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -15,8 +15,7 @@ // an AST, but this isn't a natural thing to produce in the context // of parsing or to store in an AST -@_spi(RegexBuilder) -public struct _CharacterClassModel: Hashable { +struct _CharacterClassModel: Hashable { /// The actual character class to match. var cc: Representation @@ -28,7 +27,7 @@ public struct _CharacterClassModel: Hashable { var isInverted: Bool = false // TODO: Split out builtin character classes into their own type? - public enum Representation: Hashable { + enum Representation: Hashable { /// Any character case any /// Any grapheme cluster @@ -85,7 +84,7 @@ public struct _CharacterClassModel: Hashable { } /// Inverts a character class. - public var inverted: Self { + var inverted: Self { return withInversion(true) } @@ -161,51 +160,50 @@ public struct _CharacterClassModel: Hashable { } } -@_spi(RegexBuilder) extension _CharacterClassModel { - public static var any: _CharacterClassModel { + static var any: _CharacterClassModel { .init(cc: .any, matchLevel: .graphemeCluster) } - public static var anyGrapheme: _CharacterClassModel { + static var anyGrapheme: _CharacterClassModel { .init(cc: .anyGrapheme, matchLevel: .graphemeCluster) } - public static var anyUnicodeScalar: _CharacterClassModel { + static var anyUnicodeScalar: _CharacterClassModel { .init(cc: .any, matchLevel: .unicodeScalar) } - public static var whitespace: _CharacterClassModel { + static var whitespace: _CharacterClassModel { .init(cc: .whitespace, matchLevel: .graphemeCluster) } - public static var digit: _CharacterClassModel { + static var digit: _CharacterClassModel { .init(cc: .digit, matchLevel: .graphemeCluster) } - public static var hexDigit: _CharacterClassModel { + static var hexDigit: _CharacterClassModel { .init(cc: .hexDigit, matchLevel: .graphemeCluster) } - public static var horizontalWhitespace: _CharacterClassModel { + static var horizontalWhitespace: _CharacterClassModel { .init(cc: .horizontalWhitespace, matchLevel: .graphemeCluster) } - public static var newlineSequence: _CharacterClassModel { + static var newlineSequence: _CharacterClassModel { .init(cc: .newlineSequence, matchLevel: .graphemeCluster) } - public static var verticalWhitespace: _CharacterClassModel { + static var verticalWhitespace: _CharacterClassModel { .init(cc: .verticalWhitespace, matchLevel: .graphemeCluster) } - public static var word: _CharacterClassModel { + static var word: _CharacterClassModel { .init(cc: .word, matchLevel: .graphemeCluster) } } extension _CharacterClassModel.Representation: CustomStringConvertible { - public var description: String { + var description: String { switch self { case .any: return "" case .anyGrapheme: return "" @@ -222,73 +220,11 @@ extension _CharacterClassModel.Representation: CustomStringConvertible { } extension _CharacterClassModel: CustomStringConvertible { - public var description: String { + var description: String { return "\(isInverted ? "not " : "")\(cc)" } } -extension _CharacterClassModel { - public func makeDSLTreeCharacterClass() -> DSLTree.CustomCharacterClass? { - // FIXME: Implement in DSLTree instead of wrapping an AST atom - switch makeAST() { - case .atom(let atom): - return .init(members: [.atom(.unconverted(.init(ast: atom)))]) - default: - return nil - } - } - - internal func makeAST() -> AST.Node? { - let inv = isInverted - - func esc(_ b: AST.Atom.EscapedBuiltin) -> AST.Node { - escaped(b) - } - - switch cc { - case .any: return atom(.any) - - case .digit: - return esc(inv ? .notDecimalDigit : .decimalDigit) - - case .horizontalWhitespace: - return esc( - inv ? .notHorizontalWhitespace : .horizontalWhitespace) - - // FIXME: newline sequence is not same as \n - case .newlineSequence: - return esc(inv ? .notNewline : .newline) - - case .whitespace: - return esc(inv ? .notWhitespace : .whitespace) - - case .verticalWhitespace: - return esc(inv ? .notVerticalTab : .verticalTab) - - case .word: - return esc(inv ? .notWordCharacter : .wordCharacter) - - case .anyGrapheme: - return esc(.graphemeCluster) - - case .hexDigit: - let members: [AST.CustomCharacterClass.Member] = [ - range_m(.char("a"), .char("f")), - range_m(.char("A"), .char("F")), - range_m(.char("0"), .char("9")), - ] - let ccc = AST.CustomCharacterClass( - .init(faking: inv ? .inverted : .normal), - members, - .fake) - - return .customCharacterClass(ccc) - - default: return nil - } - } -} - extension _CharacterClassModel { func withMatchLevel( _ level: _CharacterClassModel.MatchLevel From b72e2622ed211ba7b91d1811a5b6852bc5ac6dbc Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:39 +0100 Subject: [PATCH 05/14] Fix `CharacterClass.newlineSequence` Map to `.newlineSequence` instead of `.newline`, which allows it to create the correct consumer. rdar://96330096 --- Sources/_StringProcessing/Regex/DSLTree.swift | 3 +- Tests/RegexBuilderTests/RegexDSLTests.swift | 110 ++++++++++++++++++ Tests/RegexTests/MatchTests.swift | 43 +++++++ 3 files changed, 154 insertions(+), 2 deletions(-) diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index eb357ae87..dc695cbf1 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -837,8 +837,7 @@ extension DSLTree { .init(ast: .init(.escaped(.horizontalWhitespace), .fake)) } public static var _newlineSequence: Self { - // FIXME: newline sequence is not same as \n - .init(ast: .init(.escaped(.newline), .fake)) + .init(ast: .init(.escaped(.newlineSequence), .fake)) } public static var _verticalWhitespace: Self { .init(ast: .init(.escaped(.verticalTab), .fake)) diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index b67c6c242..d750884c1 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -110,6 +110,116 @@ class RegexDSLTests: XCTestCase { CharacterClass.whitespace.inverted } } + + let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" + let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" + + // `.newlineSequence` and `.verticalWhitespace` match the same set of + // newlines in grapheme semantic mode, and scalar mode when applied with + // OneOrMore. + for cc in [CharacterClass.newlineSequence, .verticalWhitespace] { + for mode in [RegexSemanticLevel.unicodeScalar, .graphemeCluster] { + try _testDSLCaptures( + ("\n", ("\n", "\n")), + ("\r", ("\r", "\r")), + ("\r\n", ("\r\n", "\r\n")), + (allNewlines, (allNewlines[...], allNewlines[...])), + ("abc\ndef", ("abc\ndef", "\n")), + ("abc\n\r\ndef", ("abc\n\r\ndef", "\n\r\n")), + ("abc\(allNewlines)def", ("abc\(allNewlines)def", allNewlines[...])), + ("abc", nil), + matchType: (Substring, Substring).self, ==) + { + Regex { + ZeroOrMore { + cc.inverted + } + Capture { + OneOrMore(cc) + } + ZeroOrMore { + cc.inverted + } + }.matchingSemantics(mode) + } + + // Try with ASCII-only whitespace. + try _testDSLCaptures( + ("\n", ("\n", "\n")), + ("\r", ("\r", "\r")), + ("\r\n", ("\r\n", "\r\n")), + (allNewlines, (allNewlines[...], asciiNewlines[...])), + ("abc\ndef", ("abc\ndef", "\n")), + ("abc\n\r\ndef", ("abc\n\r\ndef", "\n\r\n")), + ("abc\(allNewlines)def", ("abc\(allNewlines)def", asciiNewlines[...])), + ("abc", nil), + matchType: (Substring, Substring).self, ==) + { + Regex { + ZeroOrMore { + cc.inverted + } + Capture { + OneOrMore(cc) + } + ZeroOrMore { + cc.inverted + } + }.matchingSemantics(mode).asciiOnlyWhitespace() + } + } + } + + // `.newlineSequence` in scalar mode may match a single `\r\n`. + // `.verticalWhitespace` may not. + for asciiOnly in [true, false] { + try _testDSLCaptures( + ("\r", "\r"), + ("\r\n", "\r\n"), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.newlineSequence + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", nil), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.newlineSequence.inverted + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", "\r"), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.verticalWhitespace + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", nil), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.verticalWhitespace.inverted + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", nil), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.verticalWhitespace.inverted + "\n" + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + } } func testCharacterClassOperations() throws { diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index bb0d2b63e..8f7baf4b9 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -634,6 +634,49 @@ extension RegexTests { ("\n", true), ("\r", true)) + let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" + let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" + + for level in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + firstMatchTest( + #"\R+"#, + input: "abc\(allNewlines)def", match: allNewlines, + semanticLevel: level + ) + firstMatchTest( + #"\v+"#, + input: "abc\(allNewlines)def", match: allNewlines, + semanticLevel: level + ) + } + + // In scalar mode, \R can match \r\n, \v cannot. + firstMatchTest( + #"\R"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar) + firstMatchTest( + #"\v"#, input: "\r\n", match: "\r", semanticLevel: .unicodeScalar) + firstMatchTest( + #"\v\v"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar) + firstMatchTest( + #"[^\v]"#, input: "\r\n", match: nil, semanticLevel: .unicodeScalar) + + // ASCII-only spaces. + firstMatchTest(#"(?S)\R+"#, input: allNewlines, match: asciiNewlines) + firstMatchTest(#"(?S)\v+"#, input: allNewlines, match: asciiNewlines) + firstMatchTest( + #"(?S)\R"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar) + firstMatchTest( + #"(?S)\v"#, input: "\r\n", match: "\r", semanticLevel: .unicodeScalar) + + matchTest( + #"[a]\u0301"#, + ("a\u{301}", false), + semanticLevel: .graphemeCluster) + matchTest( + #"[a]\u0301"#, + ("a\u{301}", true), + semanticLevel: .unicodeScalar) + firstMatchTest("[-]", input: "123-abcxyz", match: "-") // These are metacharacters in certain contexts, but normal characters From 6fb86fcd79c55d67f77b7dde20a3a6c9382b692b Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:40 +0100 Subject: [PATCH 06/14] Rename `any` -> `dot` Explicitly disambiguate the fact we're talking about `.`, which does not match newlines unless in single line mode. --- Sources/RegexBuilder/CharacterClass.swift | 2 +- Sources/_RegexParser/Regex/AST/Atom.swift | 8 ++++---- .../_RegexParser/Regex/Parse/LexicalAnalysis.swift | 2 +- Sources/_RegexParser/Regex/Parse/Sema.swift | 4 ++-- Sources/_RegexParser/Regex/Printing/DumpAST.swift | 2 +- Sources/_StringProcessing/ByteCodeGen.swift | 10 +++++----- Sources/_StringProcessing/ConsumerInterface.swift | 6 +++--- Sources/_StringProcessing/PrintAsPattern.swift | 10 ++++++---- Sources/_StringProcessing/Regex/ASTConversion.swift | 2 +- Sources/_StringProcessing/Regex/DSLTree.swift | 13 ++++++++----- .../_StringProcessing/_CharacterClassModel.swift | 6 +++--- Tests/RegexTests/ParseTests.swift | 10 +++++----- 12 files changed, 40 insertions(+), 35 deletions(-) diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index 4e96e510d..8b4a21fb7 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -42,7 +42,7 @@ extension CharacterClass { @available(SwiftStdlib 5.7, *) extension RegexComponent where Self == CharacterClass { public static var any: CharacterClass { - .init(DSLTree.CustomCharacterClass(members: [.atom(.any)])) + .init(DSLTree.CustomCharacterClass(members: [.atom(.dot)])) } public static var anyGraphemeCluster: CharacterClass { diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index f1419ad78..2e39c9c4c 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -60,7 +60,7 @@ extension AST { case namedCharacter(String) /// . - case any + case dot /// ^ case startOfLine @@ -104,7 +104,7 @@ extension AST.Atom { case .callout(let v): return v case .backtrackingDirective(let v): return v case .changeMatchingOptions(let v): return v - case .any: return nil + case .dot: return nil case .startOfLine: return nil case .endOfLine: return nil case .invalid: return nil @@ -806,7 +806,7 @@ extension AST.Atom { // the AST? Or defer for the matching engine? return nil - case .scalarSequence, .property, .any, .startOfLine, .endOfLine, + case .scalarSequence, .property, .dot, .startOfLine, .endOfLine, .backreference, .subpattern, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: return nil @@ -858,7 +858,7 @@ extension AST.Atom { case .keyboardMetaControl(let x): return "\\M-\\C-\(x)" - case .property, .escaped, .any, .startOfLine, .endOfLine, + case .property, .escaped, .dot, .startOfLine, .endOfLine, .backreference, .subpattern, .namedCharacter, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: return nil diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 2168dbb03..d14a17785 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -2073,7 +2073,7 @@ extension Parser { p.unreachable("Should have lexed a group or group-like atom") // (sometimes) special metacharacters - case ".": return customCC ? .char(".") : .any + case ".": return customCC ? .char(".") : .dot case "^": return customCC ? .char("^") : .startOfLine case "$": return customCC ? .char("$") : .endOfLine diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 0aeee282d..88744bae2 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -221,7 +221,7 @@ extension RegexValidator { ) { switch esc { case .resetStartOfMatch, .singleDataUnit, .trueAnychar, - // '\N' needs to be emitted using 'emitAny'. + // '\N' needs to be emitted using 'emitDot'. .notNewline: error(.unsupported("'\\\(esc.character)'"), at: loc) @@ -288,7 +288,7 @@ extension RegexValidator { at: atom.location) } - case .char, .scalar, .startOfLine, .endOfLine, .any: + case .char, .scalar, .startOfLine, .endOfLine, .dot: break case .invalid: diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 48a2512cf..68c32e4a1 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -153,7 +153,7 @@ extension AST.Atom { case .keyboardControl, .keyboardMeta, .keyboardMetaControl: fatalError("TODO") - case .any: return "." + case .dot: return "." case .startOfLine: return "^" case .endOfLine: return "$" diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d18d50aa0..07fe09016 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -55,8 +55,8 @@ fileprivate extension Compiler.ByteCodeGen { } } switch a { - case .any: - emitAny() + case .dot: + emitDot() case let .char(c): try emitCharacter(c) @@ -282,7 +282,7 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitAny() { + mutating func emitDot() { switch (options.semanticLevel, options.dotMatchesNewline) { case (.graphemeCluster, true): builder.buildAdvance(1) @@ -758,9 +758,9 @@ fileprivate extension Compiler.ByteCodeGen { try emitQuantification(amt.ast, kind, child) case let .customCharacterClass(ccc): - if ccc.containsAny { + if ccc.containsDot { if !ccc.isInverted { - emitAny() + emitDot() } else { throw Unsupported("Inverted any") } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index dbb324b67..43ec45f6c 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -111,7 +111,7 @@ extension DSLTree.Atom { : $0 == s } - case .any: + case .dot: // FIXME: Should this be a total ordering? if opts.semanticLevel == .graphemeCluster { return { input, bounds in @@ -264,10 +264,10 @@ extension AST.Atom { case let .namedCharacter(name): return consumeName(name, opts: opts) - case .any: + case .dot: assertionFailure( "Should have been handled by tree conversion") - fatalError(".atom(.any) is handled in emitAny") + fatalError(".atom(.dot) is handled in emitDot") case .startOfLine, .endOfLine: // handled in emitAssertion diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 4237eda33..d936e73b7 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -895,7 +895,8 @@ extension AST.Atom { case .namedCharacter: return (" /* TODO: named character */", false) - case .any: + case .dot: + // FIXME: This is wrong, the DSL doesn't have an equivalent to .dot. return (".any", true) case .startOfLine, .endOfLine: @@ -950,7 +951,7 @@ extension AST.Atom { case .namedCharacter(let n): return "\\N{\(n)}" - case .any: + case .dot: return "." case .startOfLine, .endOfLine: @@ -1099,7 +1100,8 @@ extension DSLTree.Atom { _ printer: inout PrettyPrinter ) -> (String, canBeWrapped: Bool)? { switch self { - case .any: + case .dot: + // FIXME: This is wrong, the DSL doesn't have an equivalent to .dot. return (".any", true) case let .char(c): @@ -1141,7 +1143,7 @@ extension DSLTree.Atom { var _regexBase: String { switch self { - case .any: + case .dot: return "." case let .char(c): diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 320d10897..12068e1bc 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -217,7 +217,7 @@ extension AST.Atom { switch self.kind { case let .char(c): return .char(c) case let .scalar(s): return .char(Character(s.value)) - case .any: return .any + case .dot: return .dot case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index dc695cbf1..56cec73ab 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -117,11 +117,11 @@ extension DSLTree { var members: [Member] var isInverted: Bool - var containsAny: Bool { + var containsDot: Bool { members.contains { member in switch member { - case .atom(.any): return true - case .custom(let ccc): return ccc.containsAny + case .atom(.dot): return true + case .custom(let ccc): return ccc.containsDot default: return false } @@ -245,7 +245,10 @@ extension DSLTree { public enum Atom { case char(Character) case scalar(Unicode.Scalar) - case any + + /// The DSL representation of '.' in a regex literal. This does not match + /// newlines unless single line mode is enabled. + case dot case assertion(_AST.AssertionKind) case backreference(_AST.Reference) @@ -857,7 +860,7 @@ extension DSLTree.Atom { switch self { case .changeMatchingOptions, .assertion: return false - case .char, .scalar, .any, .backreference, .symbolicReference, .unconverted: + case .char, .scalar, .dot, .backreference, .symbolicReference, .unconverted: return true } } diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index c0de6ebaa..9f515f220 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -245,8 +245,8 @@ extension AST.Atom { // this? Or does grapheme-semantic mode complicate that? return nil - case .any: - // `.any` is handled in the matching engine by Compiler.emitAny() and in + case .dot: + // `.dot` is handled in the matching engine by Compiler.emitDot() and in // the legacy compiler by the `.any` instruction, which can provide lower // level instructions than the CharacterClass-generated consumer closure // @@ -275,7 +275,7 @@ extension AST.Atom.EscapedBuiltin { // FIXME: This is more like '.' than inverted '\R', as it is affected // by e.g (*CR). We should therefore really be emitting it through - // emitAny(). For now we treat it as semantically invalid. + // emitDot(). For now we treat it as semantically invalid. case .notNewline: return .newlineSequence.inverted case .whitespace: return .whitespace diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 3c43f27af..52a272915 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -359,14 +359,14 @@ extension RegexTests { parseTest( "(.)*(.*)", concat( - zeroOrMore(of: capture(atom(.any))), - capture(zeroOrMore(of: atom(.any)))), + zeroOrMore(of: capture(atom(.dot))), + capture(zeroOrMore(of: atom(.dot)))), captures: [.opt, .cap]) parseTest( "((.))*((.)?)", concat( - zeroOrMore(of: capture(capture(atom(.any)))), - capture(zeroOrOne(of: capture(atom(.any))))), + zeroOrMore(of: capture(capture(atom(.dot)))), + capture(zeroOrOne(of: capture(atom(.dot))))), captures: [.opt, .opt, .cap, .opt]) parseTest( #"abc\d"#, @@ -479,7 +479,7 @@ extension RegexTests { parseTest(#"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit))) - // FIXME: '\N' should be emitted through 'emitAny', not through the + // FIXME: '\N' should be emitted through 'emitDot', not through the // _CharacterClassModel model. parseTest(#"\N"#, escaped(.notNewline), unsupported: true) From 9757edd938aa6905b94a24d3b6adcd39d88b7665 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:40 +0100 Subject: [PATCH 07/14] Re-introduce `DSLTree.Atom.any` This time as a "true any" that matches any character, including newlines. --- Sources/_StringProcessing/ByteCodeGen.swift | 33 ++++++++++++------- .../_StringProcessing/ConsumerInterface.swift | 2 +- .../_StringProcessing/PrintAsPattern.swift | 6 ++++ Sources/_StringProcessing/Regex/DSLTree.swift | 6 +++- 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 07fe09016..5e0b559cb 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -55,6 +55,9 @@ fileprivate extension Compiler.ByteCodeGen { } } switch a { + case .any: + emitAny() + case .dot: emitDot() @@ -282,23 +285,31 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitDot() { - switch (options.semanticLevel, options.dotMatchesNewline) { - case (.graphemeCluster, true): + mutating func emitAny() { + switch options.semanticLevel { + case .graphemeCluster: builder.buildAdvance(1) - case (.graphemeCluster, false): + case .unicodeScalar: + // TODO: builder.buildAdvanceUnicodeScalar(1) builder.buildConsume { input, bounds in - input[bounds.lowerBound].isNewline - ? nil - : input.index(after: bounds.lowerBound) + input.unicodeScalars.index(after: bounds.lowerBound) } + } + } - case (.unicodeScalar, true): - // TODO: builder.buildAdvanceUnicodeScalar(1) + mutating func emitDot() { + if options.dotMatchesNewline { + emitAny() + return + } + switch options.semanticLevel { + case .graphemeCluster: builder.buildConsume { input, bounds in - input.unicodeScalars.index(after: bounds.lowerBound) + input[bounds.lowerBound].isNewline + ? nil + : input.index(after: bounds.lowerBound) } - case (.unicodeScalar, false): + case .unicodeScalar: builder.buildConsume { input, bounds in input[bounds.lowerBound].isNewline ? nil diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 43ec45f6c..fd29e6045 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -111,7 +111,7 @@ extension DSLTree.Atom { : $0 == s } - case .dot: + case .any, .dot: // FIXME: Should this be a total ordering? if opts.semanticLevel == .graphemeCluster { return { input, bounds in diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index d936e73b7..0debe5059 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -1100,6 +1100,9 @@ extension DSLTree.Atom { _ printer: inout PrettyPrinter ) -> (String, canBeWrapped: Bool)? { switch self { + case .any: + return (".any", true) + case .dot: // FIXME: This is wrong, the DSL doesn't have an equivalent to .dot. return (".any", true) @@ -1143,6 +1146,9 @@ extension DSLTree.Atom { var _regexBase: String { switch self { + case .any: + return "(?s:.)" + case .dot: return "." diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 56cec73ab..7a4e7d30e 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -246,6 +246,9 @@ extension DSLTree { case char(Character) case scalar(Unicode.Scalar) + /// Any character, including newlines. + case any + /// The DSL representation of '.' in a regex literal. This does not match /// newlines unless single line mode is enabled. case dot @@ -860,7 +863,8 @@ extension DSLTree.Atom { switch self { case .changeMatchingOptions, .assertion: return false - case .char, .scalar, .dot, .backreference, .symbolicReference, .unconverted: + case .char, .scalar, .any, .dot, .backreference, .symbolicReference, + .unconverted: return true } } From fbe598c6a7a069d23c031636bdf53e8f58cae620 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:40 +0100 Subject: [PATCH 08/14] Fix `CharacterClass.any` This should map to `.any`, not `.dot`. rdar://96509234 --- Sources/RegexBuilder/CharacterClass.swift | 2 +- .../_StringProcessing/ConsumerInterface.swift | 5 ++++- .../_StringProcessing/PrintAsPattern.swift | 8 ++++---- Tests/RegexBuilderTests/RegexDSLTests.swift | 20 ++++++++++++++++--- Tests/RegexTests/RenderDSLTests.swift | 17 ++++++++++++++++ 5 files changed, 43 insertions(+), 9 deletions(-) diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index 8b4a21fb7..4e96e510d 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -42,7 +42,7 @@ extension CharacterClass { @available(SwiftStdlib 5.7, *) extension RegexComponent where Self == CharacterClass { public static var any: CharacterClass { - .init(DSLTree.CustomCharacterClass(members: [.atom(.dot)])) + .init(DSLTree.CustomCharacterClass(members: [.atom(.any)])) } public static var anyGraphemeCluster: CharacterClass { diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index fd29e6045..afc507c41 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -111,7 +111,7 @@ extension DSLTree.Atom { : $0 == s } - case .any, .dot: + case .any: // FIXME: Should this be a total ordering? if opts.semanticLevel == .graphemeCluster { return { input, bounds in @@ -123,6 +123,9 @@ extension DSLTree.Atom { } } + case .dot: + throw Unreachable(".atom(.dot) should be handled by emitDot") + case .assertion: // TODO: We could handle, should this be total? return nil diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 0debe5059..9332756f1 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -896,8 +896,8 @@ extension AST.Atom { return (" /* TODO: named character */", false) case .dot: - // FIXME: This is wrong, the DSL doesn't have an equivalent to .dot. - return (".any", true) + // The DSL does not have an equivalent to '.', print as a regex. + return ("/./", false) case .startOfLine, .endOfLine: fatalError("unreachable") @@ -1104,8 +1104,8 @@ extension DSLTree.Atom { return (".any", true) case .dot: - // FIXME: This is wrong, the DSL doesn't have an equivalent to .dot. - return (".any", true) + // The DSL does not have an equivalent to '.', print as a regex. + return ("/./", false) case let .char(c): return (String(c)._quoted, false) diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index d750884c1..013517bdc 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -69,6 +69,9 @@ class RegexDSLTests: XCTestCase { XCTAssertTrue(match.output == substringMatch.output) } + let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" + let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" + func testCharacterClasses() throws { try _testDSLCaptures( ("a c", ("a c", " ", "c")), @@ -111,9 +114,6 @@ class RegexDSLTests: XCTestCase { } } - let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" - let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" - // `.newlineSequence` and `.verticalWhitespace` match the same set of // newlines in grapheme semantic mode, and scalar mode when applied with // OneOrMore. @@ -243,6 +243,20 @@ class RegexDSLTests: XCTestCase { } } + func testAny() throws { + // .any matches newlines regardless of matching options. + for dotMatchesNewline in [true, false] { + try _testDSLCaptures( + ("abc\(allNewlines)def", "abc\(allNewlines)def"), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.any) + }.dotMatchesNewlines(dotMatchesNewline) + } + } + } + func testMatchResultDotZeroWithoutCapture() throws { let match = try XCTUnwrap("aaa".wholeMatch { OneOrMore { "a" } }) XCTAssertEqual(match.0, "aaa") diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 97ba3e333..460cc8e14 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -68,6 +68,23 @@ extension RenderDSLTests { } """) } + + func testDot() throws { + try testConversion(#".+"#, #""" + Regex { + OneOrMore { + /./ + } + } + """#) + try testConversion(#"a.c"#, #""" + Regex { + "a" + /./ + "c" + } + """#) + } func testOptions() throws { try XCTExpectFailure("Options like '(?i)' aren't converted") { From cb9c5fb66b9ed4d8a242f51f8fd6d3c2ac5db017 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:41 +0100 Subject: [PATCH 09/14] Rename `startOfLine`/`endOfLine` -> `caretAnchor`/`dollarAnchor` --- Sources/_RegexParser/Regex/AST/Atom.swift | 26 +++++++++---------- .../Regex/Parse/LexicalAnalysis.swift | 4 +-- Sources/_RegexParser/Regex/Parse/Sema.swift | 2 +- .../_RegexParser/Regex/Printing/DumpAST.swift | 6 ++--- Sources/_StringProcessing/ByteCodeGen.swift | 4 +-- .../_StringProcessing/ConsumerInterface.swift | 2 +- .../_StringProcessing/PrintAsPattern.swift | 10 ++++--- Sources/_StringProcessing/Regex/DSLTree.swift | 4 +-- 8 files changed, 30 insertions(+), 28 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 2e39c9c4c..6d8f62c42 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -63,10 +63,10 @@ extension AST { case dot /// ^ - case startOfLine + case caretAnchor /// $ - case endOfLine + case dollarAnchor // References case backreference(Reference) @@ -105,8 +105,8 @@ extension AST.Atom { case .backtrackingDirective(let v): return v case .changeMatchingOptions(let v): return v case .dot: return nil - case .startOfLine: return nil - case .endOfLine: return nil + case .caretAnchor: return nil + case .dollarAnchor: return nil case .invalid: return nil } } @@ -536,10 +536,10 @@ extension AST.Atom { case notTextSegment = #"\Y"# /// ^ - case startOfLine = #"^"# + case caretAnchor = #"^"# /// $ - case endOfLine = #"$"# + case dollarAnchor = #"$"# /// \b (from outside a custom character class) case wordBoundary = #"\b"# @@ -551,8 +551,8 @@ extension AST.Atom { public var assertionKind: AssertionKind? { switch kind { - case .startOfLine: return .startOfLine - case .endOfLine: return .endOfLine + case .caretAnchor: return .caretAnchor + case .dollarAnchor: return .dollarAnchor case .escaped(.wordBoundary): return .wordBoundary case .escaped(.notWordBoundary): return .notWordBoundary @@ -806,9 +806,9 @@ extension AST.Atom { // the AST? Or defer for the matching engine? return nil - case .scalarSequence, .property, .dot, .startOfLine, .endOfLine, - .backreference, .subpattern, .callout, .backtrackingDirective, - .changeMatchingOptions, .invalid: + case .scalarSequence, .property, .dot, .caretAnchor, + .dollarAnchor, .backreference, .subpattern, .callout, + .backtrackingDirective, .changeMatchingOptions, .invalid: return nil } } @@ -858,7 +858,7 @@ extension AST.Atom { case .keyboardMetaControl(let x): return "\\M-\\C-\(x)" - case .property, .escaped, .dot, .startOfLine, .endOfLine, + case .property, .escaped, .dot, .caretAnchor, .dollarAnchor, .backreference, .subpattern, .namedCharacter, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: return nil @@ -874,7 +874,7 @@ extension AST.Atom { // TODO: Are callouts quantifiable? case .escaped(let esc): return esc.isQuantifiable - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: return false default: return true diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index d14a17785..4a4f5c05f 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -2074,8 +2074,8 @@ extension Parser { // (sometimes) special metacharacters case ".": return customCC ? .char(".") : .dot - case "^": return customCC ? .char("^") : .startOfLine - case "$": return customCC ? .char("$") : .endOfLine + case "^": return customCC ? .char("^") : .caretAnchor + case "$": return customCC ? .char("$") : .dollarAnchor // Escaped case "\\": return p.expectEscaped().value diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 88744bae2..ea541fba7 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -288,7 +288,7 @@ extension RegexValidator { at: atom.location) } - case .char, .scalar, .startOfLine, .endOfLine, .dot: + case .char, .scalar, .caretAnchor, .dollarAnchor, .dot: break case .invalid: diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 68c32e4a1..cf5a56721 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -153,9 +153,9 @@ extension AST.Atom { case .keyboardControl, .keyboardMeta, .keyboardMetaControl: fatalError("TODO") - case .dot: return "." - case .startOfLine: return "^" - case .endOfLine: return "$" + case .dot: return "." + case .caretAnchor: return "^" + case .dollarAnchor: return "$" case .backreference(let r), .subpattern(let r): return "\(r._dumpBase)" diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 5e0b559cb..dd4915851 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -170,7 +170,7 @@ fileprivate extension Compiler.ByteCodeGen { !input.isOnGraphemeClusterBoundary(pos) } - case .startOfLine: + case .caretAnchor: // FIXME: Anchor.startOfLine must always use this first branch // The behavior of `^` should depend on `anchorsMatchNewlines`, but // the DSL-based `.startOfLine` anchor should always match the start @@ -192,7 +192,7 @@ fileprivate extension Compiler.ByteCodeGen { } } - case .endOfLine: + case .dollarAnchor: // FIXME: Anchor.endOfLine must always use this first branch // The behavior of `$` should depend on `anchorsMatchNewlines`, but // the DSL-based `.endOfLine` anchor should always match the end diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index afc507c41..ae7149a00 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -272,7 +272,7 @@ extension AST.Atom { "Should have been handled by tree conversion") fatalError(".atom(.dot) is handled in emitDot") - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: // handled in emitAssertion return nil diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 9332756f1..fc257cad4 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -627,9 +627,11 @@ extension AST.Atom.AssertionKind { // TODO: Some way to integrate this with conversion... var _patternBase: String { switch self { - case .startOfLine: + case .caretAnchor: + // FIXME: The DSL doesn't have a way of representing this. return "Anchor.startOfLine" - case .endOfLine: + case .dollarAnchor: + // FIXME: The DSL doesn't have a way of representing this. return "Anchor.endOfLine" case .wordBoundary: return "Anchor.wordBoundary" @@ -899,7 +901,7 @@ extension AST.Atom { // The DSL does not have an equivalent to '.', print as a regex. return ("/./", false) - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: fatalError("unreachable") case .backreference: @@ -954,7 +956,7 @@ extension AST.Atom { case .dot: return "." - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: fatalError("unreachable") case .backreference: diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 7a4e7d30e..93954bbf9 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -801,10 +801,10 @@ extension DSLTree { : .init(ast: .textSegment) } public static func startOfLine(_ inverted: Bool = false) -> Self { - .init(ast: .startOfLine) + .init(ast: .caretAnchor) } public static func endOfLine(_ inverted: Bool = false) -> Self { - .init(ast: .endOfLine) + .init(ast: .dollarAnchor) } public static func wordBoundary(_ inverted: Bool = false) -> Self { inverted From 3f3d253a6a021fc367108357e697707c5a71318d Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:41 +0100 Subject: [PATCH 10/14] Move AssertionKind onto the DSL This enum will start including cases that only the DSL can use, so move it off the AST. --- Sources/RegexBuilder/Anchor.swift | 32 +++++--- Sources/_RegexParser/Regex/AST/Atom.swift | 61 --------------- .../Regex/Printing/PrintAsCanonical.swift | 7 +- Sources/_StringProcessing/ByteCodeGen.swift | 4 +- .../_StringProcessing/PrintAsPattern.swift | 6 +- .../Regex/ASTConversion.swift | 32 +++++++- Sources/_StringProcessing/Regex/DSLTree.swift | 74 ++++++++++--------- .../Utility/RegexFactory.swift | 2 +- 8 files changed, 102 insertions(+), 116 deletions(-) diff --git a/Sources/RegexBuilder/Anchor.swift b/Sources/RegexBuilder/Anchor.swift index 31a3e8a0d..a7505c5a1 100644 --- a/Sources/RegexBuilder/Anchor.swift +++ b/Sources/RegexBuilder/Anchor.swift @@ -37,16 +37,30 @@ public struct Anchor { @available(SwiftStdlib 5.7, *) extension Anchor: RegexComponent { - var baseAssertion: DSLTree._AST.AssertionKind { + var baseAssertion: DSLTree.Atom.Assertion { switch kind { - case .startOfSubject: return .startOfSubject(isInverted) - case .endOfSubjectBeforeNewline: return .endOfSubjectBeforeNewline(isInverted) - case .endOfSubject: return .endOfSubject(isInverted) - case .firstMatchingPositionInSubject: return .firstMatchingPositionInSubject(isInverted) - case .textSegmentBoundary: return .textSegmentBoundary(isInverted) - case .startOfLine: return .startOfLine(isInverted) - case .endOfLine: return .endOfLine(isInverted) - case .wordBoundary: return .wordBoundary(isInverted) + case .startOfSubject: + // FIXME: Inverted? + return .startOfSubject + case .endOfSubjectBeforeNewline: + // FIXME: Inverted? + return .endOfSubjectBeforeNewline + case .endOfSubject: + // FIXME: Inverted? + return .endOfSubject + case .firstMatchingPositionInSubject: + // FIXME: Inverted? + return .firstMatchingPositionInSubject + case .textSegmentBoundary: + return isInverted ? .notTextSegment : .textSegment + case .startOfLine: + // FIXME: Inverted? + return .caretAnchor + case .endOfLine: + // FIXME: Inverted? + return .dollarAnchor + case .wordBoundary: + return isInverted ? .notWordBoundary : .wordBoundary } } diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 6d8f62c42..b03ce8c39 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -511,67 +511,6 @@ extension AST.Atom.CharacterProperty { } } -extension AST.Atom { - /// Anchors and other built-in zero-width assertions. - public enum AssertionKind: String, Hashable { - /// \A - case startOfSubject = #"\A"# - - /// \Z - case endOfSubjectBeforeNewline = #"\Z"# - - /// \z - case endOfSubject = #"\z"# - - /// \K - case resetStartOfMatch = #"\K"# - - /// \G - case firstMatchingPositionInSubject = #"\G"# - - /// \y - case textSegment = #"\y"# - - /// \Y - case notTextSegment = #"\Y"# - - /// ^ - case caretAnchor = #"^"# - - /// $ - case dollarAnchor = #"$"# - - /// \b (from outside a custom character class) - case wordBoundary = #"\b"# - - /// \B - case notWordBoundary = #"\B"# - - } - - public var assertionKind: AssertionKind? { - switch kind { - case .caretAnchor: return .caretAnchor - case .dollarAnchor: return .dollarAnchor - - case .escaped(.wordBoundary): return .wordBoundary - case .escaped(.notWordBoundary): return .notWordBoundary - case .escaped(.startOfSubject): return .startOfSubject - case .escaped(.endOfSubject): return .endOfSubject - case .escaped(.textSegment): return .textSegment - case .escaped(.notTextSegment): return .notTextSegment - case .escaped(.endOfSubjectBeforeNewline): - return .endOfSubjectBeforeNewline - case .escaped(.firstMatchingPositionInSubject): - return .firstMatchingPositionInSubject - - case .escaped(.resetStartOfMatch): return .resetStartOfMatch - - default: return nil - } - } -} - extension AST.Atom { public enum Callout: Hashable { /// A PCRE callout written `(?C...)` diff --git a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift index 0e7cfb1d3..6b8c8ab93 100644 --- a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift +++ b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift @@ -237,9 +237,6 @@ extension AST.Atom.Number { extension AST.Atom { var _canonicalBase: String { - if let anchor = self.assertionKind { - return anchor.rawValue - } if let lit = self.literalStringValue { // FIXME: We may have to re-introduce escapes // For example, `\.` will come back as "." instead @@ -248,6 +245,10 @@ extension AST.Atom { return lit } switch self.kind { + case .caretAnchor: + return "^" + case .dollarAnchor: + return "$" case .escaped(let e): return "\\\(e.character)" case .backreference(let br): diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index dd4915851..5636c6d6c 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -68,7 +68,7 @@ fileprivate extension Compiler.ByteCodeGen { try emitScalar(s) case let .assertion(kind): - try emitAssertion(kind.ast) + try emitAssertion(kind) case let .backreference(ref): try emitBackreference(ref.ast) @@ -114,7 +114,7 @@ fileprivate extension Compiler.ByteCodeGen { } mutating func emitAssertion( - _ kind: AST.Atom.AssertionKind + _ kind: DSLTree.Atom.Assertion ) throws { // FIXME: Depends on API model we have... We may want to // think through some of these with API interactions in mind diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index fc257cad4..439316b4a 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -623,7 +623,7 @@ extension String { } } -extension AST.Atom.AssertionKind { +extension DSLTree.Atom.Assertion { // TODO: Some way to integrate this with conversion... var _patternBase: String { switch self { @@ -811,7 +811,7 @@ extension AST.Atom { /// /// TODO: Some way to integrate this with conversion... var _patternBase: (String, canBeWrapped: Bool) { - if let anchor = self.assertionKind { + if let anchor = self.dslAssertionKind { return (anchor._patternBase, false) } @@ -1124,7 +1124,7 @@ extension DSLTree.Atom { } case .assertion(let a): - return (a.ast._patternBase, false) + return (a._patternBase, false) case .backreference(_): return ("/* TOOD: backreferences */", false) diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 12068e1bc..2146fd61b 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -208,10 +208,38 @@ extension AST.CustomCharacterClass { } } +extension AST.Atom.EscapedBuiltin { + var dslAssertionKind: DSLTree.Atom.Assertion? { + switch self { + case .wordBoundary: return .wordBoundary + case .notWordBoundary: return .notWordBoundary + case .startOfSubject: return .startOfSubject + case .endOfSubject: return .endOfSubject + case .textSegment: return .textSegment + case .notTextSegment: return .notTextSegment + case .endOfSubjectBeforeNewline: return .endOfSubjectBeforeNewline + case .firstMatchingPositionInSubject: return .firstMatchingPositionInSubject + case .resetStartOfMatch: return .resetStartOfMatch + default: return nil + } + } +} + +extension AST.Atom { + var dslAssertionKind: DSLTree.Atom.Assertion? { + switch kind { + case .caretAnchor: return .caretAnchor + case .dollarAnchor: return .dollarAnchor + case .escaped(let b): return b.dslAssertionKind + default: return nil + } + } +} + extension AST.Atom { var dslTreeAtom: DSLTree.Atom { - if let kind = assertionKind { - return .assertion(.init(ast: kind)) + if let kind = dslAssertionKind { + return .assertion(kind) } switch self.kind { diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 93954bbf9..f55c3bc01 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -253,7 +253,7 @@ extension DSLTree { /// newlines unless single line mode is enabled. case dot - case assertion(_AST.AssertionKind) + case assertion(Assertion) case backreference(_AST.Reference) case symbolicReference(ReferenceID) @@ -263,6 +263,44 @@ extension DSLTree { } } +extension DSLTree.Atom { + @_spi(RegexBuilder) + public enum Assertion: Hashable { + /// \A + case startOfSubject + + /// \Z + case endOfSubjectBeforeNewline + + /// \z + case endOfSubject + + /// \K + case resetStartOfMatch + + /// \G + case firstMatchingPositionInSubject + + /// \y + case textSegment + + /// \Y + case notTextSegment + + /// ^ + case caretAnchor + + /// $ + case dollarAnchor + + /// \b (from outside a custom character class) + case wordBoundary + + /// \B + case notWordBoundary + } +} + extension Unicode.GeneralCategory { var extendedGeneralCategory: Unicode.ExtendedGeneralCategory? { switch self { @@ -779,40 +817,6 @@ extension DSLTree { internal var ast: AST.AbsentFunction } - @_spi(RegexBuilder) - public struct AssertionKind { - internal var ast: AST.Atom.AssertionKind - - public static func startOfSubject(_ inverted: Bool = false) -> Self { - .init(ast: .startOfSubject) - } - public static func endOfSubjectBeforeNewline(_ inverted: Bool = false) -> Self { - .init(ast: .endOfSubjectBeforeNewline) - } - public static func endOfSubject(_ inverted: Bool = false) -> Self { - .init(ast: .endOfSubject) - } - public static func firstMatchingPositionInSubject(_ inverted: Bool = false) -> Self { - .init(ast: .firstMatchingPositionInSubject) - } - public static func textSegmentBoundary(_ inverted: Bool = false) -> Self { - inverted - ? .init(ast: .notTextSegment) - : .init(ast: .textSegment) - } - public static func startOfLine(_ inverted: Bool = false) -> Self { - .init(ast: .caretAnchor) - } - public static func endOfLine(_ inverted: Bool = false) -> Self { - .init(ast: .dollarAnchor) - } - public static func wordBoundary(_ inverted: Bool = false) -> Self { - inverted - ? .init(ast: .notWordBoundary) - : .init(ast: .wordBoundary) - } - } - @_spi(RegexBuilder) public struct Reference { internal var ast: AST.Reference diff --git a/Sources/_StringProcessing/Utility/RegexFactory.swift b/Sources/_StringProcessing/Utility/RegexFactory.swift index 693b04966..31245c0f7 100644 --- a/Sources/_StringProcessing/Utility/RegexFactory.swift +++ b/Sources/_StringProcessing/Utility/RegexFactory.swift @@ -40,7 +40,7 @@ public struct _RegexFactory { @_spi(RegexBuilder) @available(SwiftStdlib 5.7, *) public func assertion( - _ kind: DSLTree._AST.AssertionKind + _ kind: DSLTree.Atom.Assertion ) -> Regex { .init(node: .atom(.assertion(kind))) } From 540ab8c4abc7d2ee1a24f7e154250405b8cb068f Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:42 +0100 Subject: [PATCH 11/14] Fix `Anchor.startOfLine` and `Anchor.endOfLine` Introduce `startOfInput` and `endOfInput` assertion kinds, and map the DSL to them such that they do not depend on matching options. rdar://97029630 --- Sources/RegexBuilder/Anchor.swift | 4 +- Sources/_StringProcessing/ByteCodeGen.swift | 64 ++++++++++--------- .../_StringProcessing/PrintAsPattern.swift | 12 ++-- Sources/_StringProcessing/Regex/DSLTree.swift | 8 +++ Tests/RegexBuilderTests/RegexDSLTests.swift | 35 ++++++++-- Tests/RegexTests/RenderDSLTests.swift | 16 ++++- 6 files changed, 96 insertions(+), 43 deletions(-) diff --git a/Sources/RegexBuilder/Anchor.swift b/Sources/RegexBuilder/Anchor.swift index a7505c5a1..4508e3dd7 100644 --- a/Sources/RegexBuilder/Anchor.swift +++ b/Sources/RegexBuilder/Anchor.swift @@ -55,10 +55,10 @@ extension Anchor: RegexComponent { return isInverted ? .notTextSegment : .textSegment case .startOfLine: // FIXME: Inverted? - return .caretAnchor + return .startOfLine case .endOfLine: // FIXME: Inverted? - return .dollarAnchor + return .endOfLine case .wordBoundary: return isInverted ? .notWordBoundary : .wordBoundary } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 5636c6d6c..0e0673988 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -113,6 +113,32 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitStartOfLine() { + builder.buildAssert { [semanticLevel = options.semanticLevel] + (_, _, input, pos, subjectBounds) in + if pos == subjectBounds.lowerBound { return true } + switch semanticLevel { + case .graphemeCluster: + return input[input.index(before: pos)].isNewline + case .unicodeScalar: + return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline + } + } + } + + mutating func emitEndOfLine() { + builder.buildAssert { [semanticLevel = options.semanticLevel] + (_, _, input, pos, subjectBounds) in + if pos == subjectBounds.upperBound { return true } + switch semanticLevel { + case .graphemeCluster: + return input[pos].isNewline + case .unicodeScalar: + return input.unicodeScalars[pos].isNewline + } + } + } + mutating func emitAssertion( _ kind: DSLTree.Atom.Assertion ) throws { @@ -170,44 +196,24 @@ fileprivate extension Compiler.ByteCodeGen { !input.isOnGraphemeClusterBoundary(pos) } + case .startOfLine: + emitStartOfLine() + + case .endOfLine: + emitEndOfLine() + case .caretAnchor: - // FIXME: Anchor.startOfLine must always use this first branch - // The behavior of `^` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.startOfLine` anchor should always match the start - // of a line. Right now we don't distinguish between those anchors. if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.lowerBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[input.index(before: pos)].isNewline - case .unicodeScalar: - return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline - } - } + emitStartOfLine() } else { builder.buildAssert { (_, _, input, pos, subjectBounds) in pos == subjectBounds.lowerBound } } - + case .dollarAnchor: - // FIXME: Anchor.endOfLine must always use this first branch - // The behavior of `$` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.endOfLine` anchor should always match the end - // of a line. Right now we don't distinguish between those anchors. if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[pos].isNewline - case .unicodeScalar: - return input.unicodeScalars[pos].isNewline - } - } + emitEndOfLine() } else { builder.buildAssert { (_, _, input, pos, subjectBounds) in pos == subjectBounds.upperBound diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 439316b4a..3e62d1886 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -627,12 +627,16 @@ extension DSLTree.Atom.Assertion { // TODO: Some way to integrate this with conversion... var _patternBase: String { switch self { - case .caretAnchor: - // FIXME: The DSL doesn't have a way of representing this. + case .startOfLine: return "Anchor.startOfLine" - case .dollarAnchor: - // FIXME: The DSL doesn't have a way of representing this. + case .endOfLine: return "Anchor.endOfLine" + case .caretAnchor: + // The DSL doesn't have an equivalent to this, so print as regex. + return "/^/" + case .dollarAnchor: + // The DSL doesn't have an equivalent to this, so print as regex. + return "/$/" case .wordBoundary: return "Anchor.wordBoundary" case .notWordBoundary: diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index f55c3bc01..549a8b3a1 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -287,6 +287,14 @@ extension DSLTree.Atom { /// \Y case notTextSegment + /// The DSL's Anchor.startOfLine, which matches the start of a line + /// even if `anchorsMatchNewlines` is false. + case startOfLine + + /// The DSL's Anchor.endOfLine, which matches the end of a line + /// even if `anchorsMatchNewlines` is false. + case endOfLine + /// ^ case caretAnchor diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 013517bdc..e6a8108ed 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -798,19 +798,40 @@ class RegexDSLTests: XCTestCase { Anchor.endOfSubject }.anchorsMatchLineEndings() } - - // FIXME: Anchor.start/endOfLine needs to always match line endings, - // even when the `anchorsMatchLineEndings()` option is turned off. + try _testDSLCaptures( - ("\naaa", "aaa"), - ("aaa\n", "aaa"), - ("\naaa\n", "aaa"), - matchType: Substring.self, ==, xfail: true) + ("\naaa", "\naaa"), + ("aaa\n", "aaa\n"), + ("\naaa\n", "\naaa\n"), + matchType: Substring.self, ==) { Regex { + Optionally { "\n" } Anchor.startOfLine Repeat("a", count: 3) Anchor.endOfLine + Optionally { "\n" } + } + } + + // startOfLine/endOfLine apply regardless of mode. + for matchLineEndings in [true, false] { + for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + let r = Regex { + Anchor.startOfLine + Repeat("a", count: 3) + Anchor.endOfLine + }.anchorsMatchLineEndings(matchLineEndings).matchingSemantics(mode) + + XCTAssertNotNil(try r.firstMatch(in: "\naaa")) + XCTAssertNotNil(try r.firstMatch(in: "aaa\n")) + XCTAssertNotNil(try r.firstMatch(in: "\naaa\n")) + XCTAssertNotNil(try r.firstMatch(in: "\naaa\r\n")) + XCTAssertNotNil(try r.firstMatch(in: "\r\naaa\n")) + XCTAssertNotNil(try r.firstMatch(in: "\r\naaa\r\n")) + + XCTAssertNil(try r.firstMatch(in: "\nbaaa\n")) + XCTAssertNil(try r.firstMatch(in: "\naaab\n")) } } } diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 460cc8e14..e33b10c31 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -85,7 +85,21 @@ extension RenderDSLTests { } """#) } - + + func testAnchor() throws { + try testConversion(#"^(?:a|b|c)$"#, #""" + Regex { + /^/ + ChoiceOf { + "a" + "b" + "c" + } + /$/ + } + """#) + } + func testOptions() throws { try XCTExpectFailure("Options like '(?i)' aren't converted") { try testConversion(#"(?i)abc"#, """ From f1026b3cb9e6c27fb0007434f1d72097745fba38 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:42 +0100 Subject: [PATCH 12/14] Add some tests for `CharacterClass.anyGraphemeCluster` --- Tests/RegexBuilderTests/RegexDSLTests.swift | 28 +++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index e6a8108ed..20a4e5d24 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -255,6 +255,34 @@ class RegexDSLTests: XCTestCase { }.dotMatchesNewlines(dotMatchesNewline) } } + + // `.anyGraphemeCluster` is the same as `.any` in grapheme mode. + for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + try _testDSLCaptures( + ("a", "a"), + ("\r\n", "\r\n"), + ("e\u{301}", "e\u{301}"), + ("e\u{301}f", nil), + ("e\u{303}\u{301}\u{302}", "e\u{303}\u{301}\u{302}"), + matchType: Substring.self, ==) + { + Regex { + One(.anyGraphemeCluster) + }.matchingSemantics(mode) + } + + // Like `.any` it also always matches newlines. + for dotMatchesNewline in [true, false] { + try _testDSLCaptures( + ("abc\(allNewlines)def", "abc\(allNewlines)def"), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.anyGraphemeCluster) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + } + } } func testMatchResultDotZeroWithoutCapture() throws { From ae3157c5e3efc704e849acb83085eec047e823ef Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:42 +0100 Subject: [PATCH 13/14] Add some tests for `CharacterClass.horizontalWhitespace` --- Tests/RegexBuilderTests/RegexDSLTests.swift | 24 +++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 20a4e5d24..67a4e8264 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -220,6 +220,30 @@ class RegexDSLTests: XCTestCase { }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) } } + + // Make sure horizontal whitespace does not match newlines or other + // vertical whitespace. + try _testDSLCaptures( + (" \u{A0} \u{9} \t ", " \u{A0} \u{9} \t "), + (" \n", nil), + (" \r", nil), + (" \r\n", nil), + (" \u{2028}", nil), + matchType: Substring.self, ==) + { + OneOrMore(.horizontalWhitespace) + } + + // Horizontal whitespace in ASCII mode. + try _testDSLCaptures( + (" \u{9} \t ", " \u{9} \t "), + ("\u{A0}", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.horizontalWhitespace) + }.asciiOnlyWhitespace() + } } func testCharacterClassOperations() throws { From 3c7d34ffa6842bb2673fda2d275e4b527b54deae Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Fri, 15 Jul 2022 10:27:43 +0100 Subject: [PATCH 14/14] Implement `CharacterClass.anyNonNewline` rdar://97029702 --- Sources/RegexBuilder/CharacterClass.swift | 4 ++ Sources/_StringProcessing/ByteCodeGen.swift | 17 ++++-- .../_StringProcessing/ConsumerInterface.swift | 16 ++++++ .../_StringProcessing/PrintAsPattern.swift | 6 ++ Sources/_StringProcessing/Regex/DSLTree.swift | 8 ++- Tests/RegexBuilderTests/RegexDSLTests.swift | 57 +++++++++++++++++++ 6 files changed, 101 insertions(+), 7 deletions(-) diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index 4e96e510d..ea52c28f3 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -45,6 +45,10 @@ extension RegexComponent where Self == CharacterClass { .init(DSLTree.CustomCharacterClass(members: [.atom(.any)])) } + public static var anyNonNewline: CharacterClass { + .init(DSLTree.CustomCharacterClass(members: [.atom(.anyNonNewline)])) + } + public static var anyGraphemeCluster: CharacterClass { .init(unconverted: ._anyGrapheme) } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 0e0673988..6263186e8 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -58,6 +58,9 @@ fileprivate extension Compiler.ByteCodeGen { case .any: emitAny() + case .anyNonNewline: + emitAnyNonNewline() + case .dot: emitDot() @@ -303,11 +306,7 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitDot() { - if options.dotMatchesNewline { - emitAny() - return - } + mutating func emitAnyNonNewline() { switch options.semanticLevel { case .graphemeCluster: builder.buildConsume { input, bounds in @@ -324,6 +323,14 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitDot() { + if options.dotMatchesNewline { + emitAny() + } else { + emitAnyNonNewline() + } + } + mutating func emitAlternation( _ children: [DSLTree.Node] ) throws { diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index ae7149a00..fb9267f4f 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -123,6 +123,22 @@ extension DSLTree.Atom { } } + case .anyNonNewline: + switch opts.semanticLevel { + case .graphemeCluster: + return { input, bounds in + input[bounds.lowerBound].isNewline + ? nil + : input.index(after: bounds.lowerBound) + } + case .unicodeScalar: + return { input, bounds in + input[bounds.lowerBound].isNewline + ? nil + : input.unicodeScalars.index(after: bounds.lowerBound) + } + } + case .dot: throw Unreachable(".atom(.dot) should be handled by emitDot") diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 3e62d1886..21c611d43 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -1109,6 +1109,9 @@ extension DSLTree.Atom { case .any: return (".any", true) + case .anyNonNewline: + return (".anyNonNewline", true) + case .dot: // The DSL does not have an equivalent to '.', print as a regex. return ("/./", false) @@ -1155,6 +1158,9 @@ extension DSLTree.Atom { case .any: return "(?s:.)" + case .anyNonNewline: + return "(?-s:.)" + case .dot: return "." diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 549a8b3a1..449baa6a7 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -249,6 +249,10 @@ extension DSLTree { /// Any character, including newlines. case any + /// Any character, excluding newlines. This differs from '.', as it is not + /// affected by single line mode. + case anyNonNewline + /// The DSL representation of '.' in a regex literal. This does not match /// newlines unless single line mode is enabled. case dot @@ -875,8 +879,8 @@ extension DSLTree.Atom { switch self { case .changeMatchingOptions, .assertion: return false - case .char, .scalar, .any, .dot, .backreference, .symbolicReference, - .unconverted: + case .char, .scalar, .any, .anyNonNewline, .dot, .backreference, + .symbolicReference, .unconverted: return true } } diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 67a4e8264..1cf039b35 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -309,6 +309,63 @@ class RegexDSLTests: XCTestCase { } } + func testAnyNonNewline() throws { + // `.anyNonNewline` is `.` without single-line mode. + for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + for dotMatchesNewline in [true, false] { + try _testDSLCaptures( + ("abcdef", "abcdef"), + ("abcdef\n", nil), + ("\r\n", nil), + ("\r", nil), + ("\n", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.anyNonNewline) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + + try _testDSLCaptures( + ("abcdef", nil), + ("abcdef\n", nil), + ("\r\n", "\r\n"), + ("\r", "\r"), + ("\n", "\n"), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.anyNonNewline.inverted) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + + try _testDSLCaptures( + ("abc", "abc"), + ("abcd", nil), + ("\r\n", nil), + ("\r", nil), + ("\n", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(CharacterClass.anyNonNewline.intersection(.anyOf("\n\rabc"))) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + } + } + + try _testDSLCaptures( + ("\r\n", "\r\n"), matchType: Substring.self, ==) { + CharacterClass.anyNonNewline.inverted + } + try _testDSLCaptures( + ("\r\n", nil), matchType: Substring.self, ==) { + Regex { + CharacterClass.anyNonNewline.inverted + }.matchingSemantics(.unicodeScalar) + } + } + func testMatchResultDotZeroWithoutCapture() throws { let match = try XCTUnwrap("aaa".wholeMatch { OneOrMore { "a" } }) XCTAssertEqual(match.0, "aaa")