diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift new file mode 100644 index 000000000..78ebd49a2 --- /dev/null +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -0,0 +1,215 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import _RegexParser +@_spi(RegexBuilder) import _StringProcessing + +public struct CharacterClass { + internal var ccc: DSLTree.CustomCharacterClass + + init(_ ccc: DSLTree.CustomCharacterClass) { + self.ccc = ccc + } + + init(unconverted model: _CharacterClassModel) { + // FIXME: Implement in DSLTree instead of wrapping an AST atom + switch model.makeAST() { + case .atom(let atom): + self.ccc = .init(members: [.atom(.unconverted(atom))]) + default: + fatalError("Unsupported _CharacterClassModel") + } + } + + init(property: AST.Atom.CharacterProperty) { + // FIXME: Implement in DSLTree instead of wrapping an AST atom + let astAtom = AST.Atom(.property(property), .fake) + self.ccc = .init(members: [.atom(.unconverted(astAtom))]) + } +} + +extension CharacterClass: RegexComponent { + public var regex: Regex { + return Regex(node: DSLTree.Node.customCharacterClass(ccc)) + } +} + +extension CharacterClass { + public var inverted: CharacterClass { + CharacterClass(ccc.inverted) + } +} + +extension RegexComponent where Self == CharacterClass { + public static var any: CharacterClass { + .init(DSLTree.CustomCharacterClass(members: [.atom(.any)])) + } + + public static var anyGrapheme: CharacterClass { + .init(unconverted: .anyGrapheme) + } + + public static var whitespace: CharacterClass { + .init(unconverted: .whitespace) + } + + public static var digit: CharacterClass { + .init(unconverted: .digit) + } + + public static var hexDigit: CharacterClass { + .init(DSLTree.CustomCharacterClass(members: [ + .range(.char("A"), .char("F")), + .range(.char("a"), .char("f")), + .range(.char("0"), .char("9")), + ])) + } + + public static var horizontalWhitespace: CharacterClass { + .init(unconverted: .horizontalWhitespace) + } + + public static var newlineSequence: CharacterClass { + .init(unconverted: .newlineSequence) + } + + public static var verticalWhitespace: CharacterClass { + .init(unconverted: .verticalWhitespace) + } + + public static var word: CharacterClass { + .init(unconverted: .word) + } +} + +extension RegexComponent where Self == CharacterClass { + /// Returns a character class that matches any character in the given string + /// or sequence. + public static func anyOf(_ s: S) -> CharacterClass + where S.Element == Character + { + CharacterClass(DSLTree.CustomCharacterClass( + members: s.map { .atom(.char($0)) })) + } + + /// Returns a character class that matches any unicode scalar in the given + /// sequence. + public static func anyOf(_ s: S) -> CharacterClass + where S.Element == UnicodeScalar + { + CharacterClass(DSLTree.CustomCharacterClass( + members: s.map { .atom(.scalar($0)) })) + } +} + +// Unicode properties +extension CharacterClass { + public static func generalCategory(_ category: Unicode.GeneralCategory) -> CharacterClass { + guard let extendedCategory = category.extendedGeneralCategory else { + fatalError("Unexpected general category") + } + return CharacterClass(property: + .init(.generalCategory(extendedCategory), isInverted: false, isPOSIX: false)) + } +} + +/// Range syntax for characters in `CharacterClass`es. +public func ...(lhs: Character, rhs: Character) -> CharacterClass { + let range: DSLTree.CustomCharacterClass.Member = .range(.char(lhs), .char(rhs)) + let ccc = DSLTree.CustomCharacterClass(members: [range], isInverted: false) + return CharacterClass(ccc) +} + +/// Range syntax for unicode scalars in `CharacterClass`es. +@_disfavoredOverload +public func ...(lhs: UnicodeScalar, rhs: UnicodeScalar) -> CharacterClass { + let range: DSLTree.CustomCharacterClass.Member = .range(.scalar(lhs), .scalar(rhs)) + let ccc = DSLTree.CustomCharacterClass(members: [range], isInverted: false) + return CharacterClass(ccc) +} + +extension Unicode.GeneralCategory { + var extendedGeneralCategory: Unicode.ExtendedGeneralCategory? { + switch self { + case .uppercaseLetter: return .uppercaseLetter + case .lowercaseLetter: return .lowercaseLetter + case .titlecaseLetter: return .titlecaseLetter + case .modifierLetter: return .modifierLetter + case .otherLetter: return .otherLetter + case .nonspacingMark: return .nonspacingMark + case .spacingMark: return .spacingMark + case .enclosingMark: return .enclosingMark + case .decimalNumber: return .decimalNumber + case .letterNumber: return .letterNumber + case .otherNumber: return .otherNumber + case .connectorPunctuation: return .connectorPunctuation + case .dashPunctuation: return .dashPunctuation + case .openPunctuation: return .openPunctuation + case .closePunctuation: return .closePunctuation + case .initialPunctuation: return .initialPunctuation + case .finalPunctuation: return .finalPunctuation + case .otherPunctuation: return .otherPunctuation + case .mathSymbol: return .mathSymbol + case .currencySymbol: return .currencySymbol + case .modifierSymbol: return .modifierSymbol + case .otherSymbol: return .otherSymbol + case .spaceSeparator: return .spaceSeparator + case .lineSeparator: return .lineSeparator + case .paragraphSeparator: return .paragraphSeparator + case .control: return .control + case .format: return .format + case .surrogate: return .surrogate + case .privateUse: return .privateUse + case .unassigned: return .unassigned + @unknown default: return nil + } + } +} + +// MARK: - Set algebra methods + +extension RegexComponent where Self == CharacterClass { + public init(_ first: CharacterClass, _ rest: CharacterClass...) { + if rest.isEmpty { + self.init(first.ccc) + } else { + let members: [DSLTree.CustomCharacterClass.Member] = + (CollectionOfOne(first) + rest).map { .custom($0.ccc) } + self.init(.init(members: members)) + } + } +} + +extension CharacterClass { + public func union(_ other: CharacterClass) -> CharacterClass { + CharacterClass(.init(members: [ + .custom(self.ccc), + .custom(other.ccc)])) + } + + public func intersection(_ other: CharacterClass) -> CharacterClass { + CharacterClass(.init(members: [ + .intersection(self.ccc, other.ccc) + ])) + } + + public func subtracting(_ other: CharacterClass) -> CharacterClass { + CharacterClass(.init(members: [ + .subtraction(self.ccc, other.ccc) + ])) + } + + public func symmetricDifference(_ other: CharacterClass) -> CharacterClass { + CharacterClass(.init(members: [ + .symmetricDifference(self.ccc, other.ccc) + ])) + } +} diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d6d6c3c5e..ed84fadbd 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -136,7 +136,7 @@ extension Compiler.ByteCodeGen { // TODO: May want to consider Unicode level builder.buildAssert { [options] (input, pos, bounds) in // TODO: How should we handle bounds? - CharacterClass.word.isBoundary( + _CharacterClassModel.word.isBoundary( input, at: pos, bounds: bounds, with: options) } @@ -144,7 +144,7 @@ extension Compiler.ByteCodeGen { // TODO: May want to consider Unicode level builder.buildAssert { [options] (input, pos, bounds) in // TODO: How should we handle bounds? - !CharacterClass.word.isBoundary( + !_CharacterClassModel.word.isBoundary( input, at: pos, bounds: bounds, with: options) } } @@ -595,7 +595,15 @@ extension Compiler.ByteCodeGen { try emitQuantification(amt, kind, child) case let .customCharacterClass(ccc): - try emitCustomCharacterClass(ccc) + if ccc.containsAny { + if !ccc.isInverted { + emitAny() + } else { + throw Unsupported("Inverted any") + } + } else { + try emitCustomCharacterClass(ccc) + } case let .atom(a): try emitAtom(a) diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index c4b2b8de7..cafb07da0 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -105,7 +105,7 @@ extension MatchingOptions { // Deprecated CharacterClass.MatchLevel API extension MatchingOptions { @available(*, deprecated) - var matchLevel: CharacterClass.MatchLevel { + var matchLevel: _CharacterClassModel.MatchLevel { switch semanticLevel { case .graphemeCluster: return .graphemeCluster diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index bd3b37a3d..59ebd38af 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -107,8 +107,31 @@ extension DSLTree { public struct CustomCharacterClass { var members: [Member] var isInverted: Bool + + var containsAny: Bool { + members.contains { member in + switch member { + case .atom(.any): return true + case .custom(let ccc): return ccc.containsAny + default: + return false + } + } + } + + public init(members: [DSLTree.CustomCharacterClass.Member], isInverted: Bool = false) { + self.members = members + self.isInverted = isInverted + } + + public var inverted: CustomCharacterClass { + var result = self + result.isInverted.toggle() + return result + } - enum Member { + @_spi(RegexBuilder) + public enum Member { case atom(Atom) case range(Atom, Atom) case custom(CustomCharacterClass) diff --git a/Sources/_StringProcessing/CharacterClass.swift b/Sources/_StringProcessing/_CharacterClassModel.swift similarity index 90% rename from Sources/_StringProcessing/CharacterClass.swift rename to Sources/_StringProcessing/_CharacterClassModel.swift index bdf34d0a7..94a42b549 100644 --- a/Sources/_StringProcessing/CharacterClass.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -15,7 +15,8 @@ import _RegexParser // an AST, but this isn't a natural thing to produce in the context // of parsing or to store in an AST -public struct CharacterClass: Hashable { +@_spi(RegexBuilder) +public struct _CharacterClassModel: Hashable { /// The actual character class to match. var cc: Representation @@ -54,6 +55,7 @@ public struct CharacterClass: Hashable { public typealias SetOperator = AST.CustomCharacterClass.SetOp /// A binary set operation that forms a character class component. + @_spi(RegexBuilder) public struct SetOperation: Hashable { var lhs: CharacterSetComponent var op: SetOperator @@ -71,12 +73,13 @@ public struct CharacterClass: Hashable { } } + @_spi(RegexBuilder) public enum CharacterSetComponent: Hashable { case character(Character) case range(ClosedRange) /// A nested character class. - case characterClass(CharacterClass) + case characterClass(_CharacterClassModel) /// A binary set operation of character class components. indirect case setOperation(SetOperation) @@ -201,7 +204,7 @@ public struct CharacterClass: Hashable { } } -extension CharacterClass: RegexComponent { +extension _CharacterClassModel: RegexComponent { public typealias Output = Substring public var regex: Regex { @@ -212,51 +215,52 @@ extension CharacterClass: RegexComponent { } } -extension RegexComponent where Self == CharacterClass { - public static var any: CharacterClass { +@_spi(RegexBuilder) +extension _CharacterClassModel { + public static var any: _CharacterClassModel { .init(cc: .any, matchLevel: .graphemeCluster) } - public static var anyGrapheme: CharacterClass { + public static var anyGrapheme: _CharacterClassModel { .init(cc: .anyGrapheme, matchLevel: .graphemeCluster) } - public static var whitespace: CharacterClass { + public static var whitespace: _CharacterClassModel { .init(cc: .whitespace, matchLevel: .graphemeCluster) } - public static var digit: CharacterClass { + public static var digit: _CharacterClassModel { .init(cc: .digit, matchLevel: .graphemeCluster) } - public static var hexDigit: CharacterClass { + public static var hexDigit: _CharacterClassModel { .init(cc: .hexDigit, matchLevel: .graphemeCluster) } - public static var horizontalWhitespace: CharacterClass { + public static var horizontalWhitespace: _CharacterClassModel { .init(cc: .horizontalWhitespace, matchLevel: .graphemeCluster) } - public static var newlineSequence: CharacterClass { + public static var newlineSequence: _CharacterClassModel { .init(cc: .newlineSequence, matchLevel: .graphemeCluster) } - public static var verticalWhitespace: CharacterClass { + public static var verticalWhitespace: _CharacterClassModel { .init(cc: .verticalWhitespace, matchLevel: .graphemeCluster) } - public static var word: CharacterClass { + public static var word: _CharacterClassModel { .init(cc: .word, matchLevel: .graphemeCluster) } public static func custom( - _ components: [CharacterClass.CharacterSetComponent] - ) -> CharacterClass { + _ components: [_CharacterClassModel.CharacterSetComponent] + ) -> _CharacterClassModel { .init(cc: .custom(components), matchLevel: .graphemeCluster) } } -extension CharacterClass.CharacterSetComponent: CustomStringConvertible { +extension _CharacterClassModel.CharacterSetComponent: CustomStringConvertible { public var description: String { switch self { case .range(let range): return "" @@ -267,7 +271,7 @@ extension CharacterClass.CharacterSetComponent: CustomStringConvertible { } } -extension CharacterClass.Representation: CustomStringConvertible { +extension _CharacterClassModel.Representation: CustomStringConvertible { public var description: String { switch self { case .any: return "" @@ -284,13 +288,13 @@ extension CharacterClass.Representation: CustomStringConvertible { } } -extension CharacterClass: CustomStringConvertible { +extension _CharacterClassModel: CustomStringConvertible { public var description: String { return "\(isInverted ? "not " : "")\(cc)" } } -extension CharacterClass { +extension _CharacterClassModel { public func makeAST() -> AST.Node? { let inv = isInverted @@ -343,7 +347,7 @@ extension CharacterClass { } extension DSLTree.Node { - var characterClass: CharacterClass? { + var characterClass: _CharacterClassModel? { switch self { case let .customCharacterClass(ccc): return ccc.modelCharacterClass @@ -358,10 +362,10 @@ extension DSLTree.Node { } } -extension CharacterClass { +extension _CharacterClassModel { public func withMatchLevel( - _ level: CharacterClass.MatchLevel - ) -> CharacterClass { + _ level: _CharacterClassModel.MatchLevel + ) -> _CharacterClassModel { var cc = self cc.matchLevel = level return cc @@ -369,7 +373,7 @@ extension CharacterClass { } extension DSLTree.Atom { - var characterClass: CharacterClass? { + var characterClass: _CharacterClassModel? { switch self { case let .unconverted(a): return a.characterClass @@ -380,7 +384,7 @@ extension DSLTree.Atom { } extension AST.Atom { - var characterClass: CharacterClass? { + var characterClass: _CharacterClassModel? { switch kind { case let .escaped(b): return b.characterClass @@ -406,7 +410,7 @@ extension AST.Atom { } extension AST.Atom.EscapedBuiltin { - var characterClass: CharacterClass? { + var characterClass: _CharacterClassModel? { switch self { case .decimalDigit: return .digit case .notDecimalDigit: return .digit.inverted @@ -437,9 +441,9 @@ extension AST.Atom.EscapedBuiltin { extension DSLTree.CustomCharacterClass { // TODO: Refactor a bit, and... can we drop this type? - var modelCharacterClass: CharacterClass? { + var modelCharacterClass: _CharacterClassModel? { var result = - Array() + Array<_CharacterClassModel.CharacterSetComponent>() for m in members { switch m { case let .atom(a): @@ -505,12 +509,12 @@ extension DSLTree.CustomCharacterClass { break } } - let cc = CharacterClass.custom(result) + let cc = _CharacterClassModel.custom(result) return isInverted ? cc.inverted : cc } } -extension CharacterClass { +extension _CharacterClassModel { // FIXME: Calling on inverted sets wont be the same as the // inverse of a boundary if at the start or end of the // string. (Think through what we want: do it ourselves or diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 2044c8859..aaa3f6886 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -70,6 +70,59 @@ class RegexDSLTests: XCTestCase { Capture(.whitespace) // Substring Capture("c") // Substring } + + try _testDSLCaptures( + ("abc1def2", "abc1def2"), + matchType: Substring.self, ==) + { + // First group + OneOrMore { + CharacterClass("a"..."z", .digit) + } + + // Second group + OneOrMore { + ChoiceOf { + "a"..."z" + CharacterClass.hexDigit + } + } + } + + try _testDSLCaptures( + ("abc1def2", ("abc1def2", "abc1")), + matchType: (Substring, Substring).self, ==) + { + Capture { + OneOrMore(.digit.inverted) + ("a"..."z").inverted + } + + OneOrMore { + CharacterClass.whitespace.inverted + } + } + } + + func testCharacterClassOperations() throws { + try _testDSLCaptures( + ("bcdefn1a", "bcdefn1a"), + ("nbcdef1a", nil), // fails symmetric difference lookahead + ("abcdef1a", nil), // fails union + ("bcdef3a", nil), // fails subtraction + ("bcdef1z", nil), // fails intersection + matchType: Substring.self, ==) + { + let disallowedChars = CharacterClass.hexDigit + .symmetricDifference("a"..."z") + Lookahead(disallowedChars, negative: true) // No: 0-9 + g-z + + OneOrMore(("b"..."g").union("d"..."n")) // b-n + + CharacterClass.digit.subtracting("3"..."9") // 1, 2, non-ascii digits + + CharacterClass.hexDigit.intersection("a"..."z") // a-f + } } func testMatchResultDotZeroWithoutCapture() throws {