diff --git a/Sources/RegexBuilder/Anchor.swift b/Sources/RegexBuilder/Anchor.swift index 31a3e8a0d..4508e3dd7 100644 --- a/Sources/RegexBuilder/Anchor.swift +++ b/Sources/RegexBuilder/Anchor.swift @@ -37,16 +37,30 @@ public struct Anchor { @available(SwiftStdlib 5.7, *) extension Anchor: RegexComponent { - var baseAssertion: DSLTree._AST.AssertionKind { + var baseAssertion: DSLTree.Atom.Assertion { switch kind { - case .startOfSubject: return .startOfSubject(isInverted) - case .endOfSubjectBeforeNewline: return .endOfSubjectBeforeNewline(isInverted) - case .endOfSubject: return .endOfSubject(isInverted) - case .firstMatchingPositionInSubject: return .firstMatchingPositionInSubject(isInverted) - case .textSegmentBoundary: return .textSegmentBoundary(isInverted) - case .startOfLine: return .startOfLine(isInverted) - case .endOfLine: return .endOfLine(isInverted) - case .wordBoundary: return .wordBoundary(isInverted) + case .startOfSubject: + // FIXME: Inverted? + return .startOfSubject + case .endOfSubjectBeforeNewline: + // FIXME: Inverted? + return .endOfSubjectBeforeNewline + case .endOfSubject: + // FIXME: Inverted? + return .endOfSubject + case .firstMatchingPositionInSubject: + // FIXME: Inverted? + return .firstMatchingPositionInSubject + case .textSegmentBoundary: + return isInverted ? .notTextSegment : .textSegment + case .startOfLine: + // FIXME: Inverted? + return .startOfLine + case .endOfLine: + // FIXME: Inverted? + return .endOfLine + case .wordBoundary: + return isInverted ? .notWordBoundary : .wordBoundary } } diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index a6d18b2cf..ea52c28f3 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -20,11 +20,8 @@ public struct CharacterClass { self.ccc = ccc } - init(unconverted model: _CharacterClassModel) { - guard let ccc = model.makeDSLTreeCharacterClass() else { - fatalError("Unsupported character class") - } - self.ccc = ccc + init(unconverted atom: DSLTree._AST.Atom) { + self.ccc = .init(members: [.atom(.unconverted(atom))]) } } @@ -48,16 +45,20 @@ extension RegexComponent where Self == CharacterClass { .init(DSLTree.CustomCharacterClass(members: [.atom(.any)])) } + public static var anyNonNewline: CharacterClass { + .init(DSLTree.CustomCharacterClass(members: [.atom(.anyNonNewline)])) + } + public static var anyGraphemeCluster: CharacterClass { - .init(unconverted: .anyGrapheme) + .init(unconverted: ._anyGrapheme) } public static var whitespace: CharacterClass { - .init(unconverted: .whitespace) + .init(unconverted: ._whitespace) } public static var digit: CharacterClass { - .init(unconverted: .digit) + .init(unconverted: ._digit) } public static var hexDigit: CharacterClass { @@ -69,19 +70,19 @@ extension RegexComponent where Self == CharacterClass { } public static var horizontalWhitespace: CharacterClass { - .init(unconverted: .horizontalWhitespace) + .init(unconverted: ._horizontalWhitespace) } public static var newlineSequence: CharacterClass { - .init(unconverted: .newlineSequence) + .init(unconverted: ._newlineSequence) } public static var verticalWhitespace: CharacterClass { - .init(unconverted: .verticalWhitespace) + .init(unconverted: ._verticalWhitespace) } public static var word: CharacterClass { - .init(unconverted: .word) + .init(unconverted: ._word) } } diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index f1419ad78..b03ce8c39 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -60,13 +60,13 @@ extension AST { case namedCharacter(String) /// . - case any + case dot /// ^ - case startOfLine + case caretAnchor /// $ - case endOfLine + case dollarAnchor // References case backreference(Reference) @@ -104,9 +104,9 @@ extension AST.Atom { case .callout(let v): return v case .backtrackingDirective(let v): return v case .changeMatchingOptions(let v): return v - case .any: return nil - case .startOfLine: return nil - case .endOfLine: return nil + case .dot: return nil + case .caretAnchor: return nil + case .dollarAnchor: return nil case .invalid: return nil } } @@ -511,67 +511,6 @@ extension AST.Atom.CharacterProperty { } } -extension AST.Atom { - /// Anchors and other built-in zero-width assertions. - public enum AssertionKind: String, Hashable { - /// \A - case startOfSubject = #"\A"# - - /// \Z - case endOfSubjectBeforeNewline = #"\Z"# - - /// \z - case endOfSubject = #"\z"# - - /// \K - case resetStartOfMatch = #"\K"# - - /// \G - case firstMatchingPositionInSubject = #"\G"# - - /// \y - case textSegment = #"\y"# - - /// \Y - case notTextSegment = #"\Y"# - - /// ^ - case startOfLine = #"^"# - - /// $ - case endOfLine = #"$"# - - /// \b (from outside a custom character class) - case wordBoundary = #"\b"# - - /// \B - case notWordBoundary = #"\B"# - - } - - public var assertionKind: AssertionKind? { - switch kind { - case .startOfLine: return .startOfLine - case .endOfLine: return .endOfLine - - case .escaped(.wordBoundary): return .wordBoundary - case .escaped(.notWordBoundary): return .notWordBoundary - case .escaped(.startOfSubject): return .startOfSubject - case .escaped(.endOfSubject): return .endOfSubject - case .escaped(.textSegment): return .textSegment - case .escaped(.notTextSegment): return .notTextSegment - case .escaped(.endOfSubjectBeforeNewline): - return .endOfSubjectBeforeNewline - case .escaped(.firstMatchingPositionInSubject): - return .firstMatchingPositionInSubject - - case .escaped(.resetStartOfMatch): return .resetStartOfMatch - - default: return nil - } - } -} - extension AST.Atom { public enum Callout: Hashable { /// A PCRE callout written `(?C...)` @@ -806,9 +745,9 @@ extension AST.Atom { // the AST? Or defer for the matching engine? return nil - case .scalarSequence, .property, .any, .startOfLine, .endOfLine, - .backreference, .subpattern, .callout, .backtrackingDirective, - .changeMatchingOptions, .invalid: + case .scalarSequence, .property, .dot, .caretAnchor, + .dollarAnchor, .backreference, .subpattern, .callout, + .backtrackingDirective, .changeMatchingOptions, .invalid: return nil } } @@ -858,7 +797,7 @@ extension AST.Atom { case .keyboardMetaControl(let x): return "\\M-\\C-\(x)" - case .property, .escaped, .any, .startOfLine, .endOfLine, + case .property, .escaped, .dot, .caretAnchor, .dollarAnchor, .backreference, .subpattern, .namedCharacter, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: return nil @@ -874,7 +813,7 @@ extension AST.Atom { // TODO: Are callouts quantifiable? case .escaped(let esc): return esc.isQuantifiable - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: return false default: return true diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 2168dbb03..4a4f5c05f 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -2073,9 +2073,9 @@ extension Parser { p.unreachable("Should have lexed a group or group-like atom") // (sometimes) special metacharacters - case ".": return customCC ? .char(".") : .any - case "^": return customCC ? .char("^") : .startOfLine - case "$": return customCC ? .char("$") : .endOfLine + case ".": return customCC ? .char(".") : .dot + case "^": return customCC ? .char("^") : .caretAnchor + case "$": return customCC ? .char("$") : .dollarAnchor // Escaped case "\\": return p.expectEscaped().value diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 0aeee282d..ea541fba7 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -221,7 +221,7 @@ extension RegexValidator { ) { switch esc { case .resetStartOfMatch, .singleDataUnit, .trueAnychar, - // '\N' needs to be emitted using 'emitAny'. + // '\N' needs to be emitted using 'emitDot'. .notNewline: error(.unsupported("'\\\(esc.character)'"), at: loc) @@ -288,7 +288,7 @@ extension RegexValidator { at: atom.location) } - case .char, .scalar, .startOfLine, .endOfLine, .any: + case .char, .scalar, .caretAnchor, .dollarAnchor, .dot: break case .invalid: diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 48a2512cf..cf5a56721 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -153,9 +153,9 @@ extension AST.Atom { case .keyboardControl, .keyboardMeta, .keyboardMetaControl: fatalError("TODO") - case .any: return "." - case .startOfLine: return "^" - case .endOfLine: return "$" + case .dot: return "." + case .caretAnchor: return "^" + case .dollarAnchor: return "$" case .backreference(let r), .subpattern(let r): return "\(r._dumpBase)" diff --git a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift index 0e7cfb1d3..6b8c8ab93 100644 --- a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift +++ b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift @@ -237,9 +237,6 @@ extension AST.Atom.Number { extension AST.Atom { var _canonicalBase: String { - if let anchor = self.assertionKind { - return anchor.rawValue - } if let lit = self.literalStringValue { // FIXME: We may have to re-introduce escapes // For example, `\.` will come back as "." instead @@ -248,6 +245,10 @@ extension AST.Atom { return lit } switch self.kind { + case .caretAnchor: + return "^" + case .dollarAnchor: + return "$" case .escaped(let e): return "\\\(e.character)" case .backreference(let br): diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d18d50aa0..6263186e8 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -58,6 +58,12 @@ fileprivate extension Compiler.ByteCodeGen { case .any: emitAny() + case .anyNonNewline: + emitAnyNonNewline() + + case .dot: + emitDot() + case let .char(c): try emitCharacter(c) @@ -65,7 +71,7 @@ fileprivate extension Compiler.ByteCodeGen { try emitScalar(s) case let .assertion(kind): - try emitAssertion(kind.ast) + try emitAssertion(kind) case let .backreference(ref): try emitBackreference(ref.ast) @@ -110,8 +116,34 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitStartOfLine() { + builder.buildAssert { [semanticLevel = options.semanticLevel] + (_, _, input, pos, subjectBounds) in + if pos == subjectBounds.lowerBound { return true } + switch semanticLevel { + case .graphemeCluster: + return input[input.index(before: pos)].isNewline + case .unicodeScalar: + return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline + } + } + } + + mutating func emitEndOfLine() { + builder.buildAssert { [semanticLevel = options.semanticLevel] + (_, _, input, pos, subjectBounds) in + if pos == subjectBounds.upperBound { return true } + switch semanticLevel { + case .graphemeCluster: + return input[pos].isNewline + case .unicodeScalar: + return input.unicodeScalars[pos].isNewline + } + } + } + mutating func emitAssertion( - _ kind: AST.Atom.AssertionKind + _ kind: DSLTree.Atom.Assertion ) throws { // FIXME: Depends on API model we have... We may want to // think through some of these with API interactions in mind @@ -168,43 +200,23 @@ fileprivate extension Compiler.ByteCodeGen { } case .startOfLine: - // FIXME: Anchor.startOfLine must always use this first branch - // The behavior of `^` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.startOfLine` anchor should always match the start - // of a line. Right now we don't distinguish between those anchors. + emitStartOfLine() + + case .endOfLine: + emitEndOfLine() + + case .caretAnchor: if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.lowerBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[input.index(before: pos)].isNewline - case .unicodeScalar: - return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline - } - } + emitStartOfLine() } else { builder.buildAssert { (_, _, input, pos, subjectBounds) in pos == subjectBounds.lowerBound } } - - case .endOfLine: - // FIXME: Anchor.endOfLine must always use this first branch - // The behavior of `$` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.endOfLine` anchor should always match the end - // of a line. Right now we don't distinguish between those anchors. + + case .dollarAnchor: if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[pos].isNewline - case .unicodeScalar: - return input.unicodeScalars[pos].isNewline - } - } + emitEndOfLine() } else { builder.buildAssert { (_, _, input, pos, subjectBounds) in pos == subjectBounds.upperBound @@ -283,22 +295,26 @@ fileprivate extension Compiler.ByteCodeGen { } mutating func emitAny() { - switch (options.semanticLevel, options.dotMatchesNewline) { - case (.graphemeCluster, true): + switch options.semanticLevel { + case .graphemeCluster: builder.buildAdvance(1) - case (.graphemeCluster, false): + case .unicodeScalar: + // TODO: builder.buildAdvanceUnicodeScalar(1) builder.buildConsume { input, bounds in - input[bounds.lowerBound].isNewline - ? nil - : input.index(after: bounds.lowerBound) + input.unicodeScalars.index(after: bounds.lowerBound) } + } + } - case (.unicodeScalar, true): - // TODO: builder.buildAdvanceUnicodeScalar(1) + mutating func emitAnyNonNewline() { + switch options.semanticLevel { + case .graphemeCluster: builder.buildConsume { input, bounds in - input.unicodeScalars.index(after: bounds.lowerBound) + input[bounds.lowerBound].isNewline + ? nil + : input.index(after: bounds.lowerBound) } - case (.unicodeScalar, false): + case .unicodeScalar: builder.buildConsume { input, bounds in input[bounds.lowerBound].isNewline ? nil @@ -307,6 +323,14 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitDot() { + if options.dotMatchesNewline { + emitAny() + } else { + emitAnyNonNewline() + } + } + mutating func emitAlternation( _ children: [DSLTree.Node] ) throws { @@ -758,9 +782,9 @@ fileprivate extension Compiler.ByteCodeGen { try emitQuantification(amt.ast, kind, child) case let .customCharacterClass(ccc): - if ccc.containsAny { + if ccc.containsDot { if !ccc.isInverted { - emitAny() + emitDot() } else { throw Unsupported("Inverted any") } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index dbb324b67..fb9267f4f 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -123,6 +123,25 @@ extension DSLTree.Atom { } } + case .anyNonNewline: + switch opts.semanticLevel { + case .graphemeCluster: + return { input, bounds in + input[bounds.lowerBound].isNewline + ? nil + : input.index(after: bounds.lowerBound) + } + case .unicodeScalar: + return { input, bounds in + input[bounds.lowerBound].isNewline + ? nil + : input.unicodeScalars.index(after: bounds.lowerBound) + } + } + + case .dot: + throw Unreachable(".atom(.dot) should be handled by emitDot") + case .assertion: // TODO: We could handle, should this be total? return nil @@ -264,12 +283,12 @@ extension AST.Atom { case let .namedCharacter(name): return consumeName(name, opts: opts) - case .any: + case .dot: assertionFailure( "Should have been handled by tree conversion") - fatalError(".atom(.any) is handled in emitAny") + fatalError(".atom(.dot) is handled in emitDot") - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: // handled in emitAssertion return nil diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 4237eda33..21c611d43 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -623,7 +623,7 @@ extension String { } } -extension AST.Atom.AssertionKind { +extension DSLTree.Atom.Assertion { // TODO: Some way to integrate this with conversion... var _patternBase: String { switch self { @@ -631,6 +631,12 @@ extension AST.Atom.AssertionKind { return "Anchor.startOfLine" case .endOfLine: return "Anchor.endOfLine" + case .caretAnchor: + // The DSL doesn't have an equivalent to this, so print as regex. + return "/^/" + case .dollarAnchor: + // The DSL doesn't have an equivalent to this, so print as regex. + return "/$/" case .wordBoundary: return "Anchor.wordBoundary" case .notWordBoundary: @@ -809,7 +815,7 @@ extension AST.Atom { /// /// TODO: Some way to integrate this with conversion... var _patternBase: (String, canBeWrapped: Bool) { - if let anchor = self.assertionKind { + if let anchor = self.dslAssertionKind { return (anchor._patternBase, false) } @@ -895,10 +901,11 @@ extension AST.Atom { case .namedCharacter: return (" /* TODO: named character */", false) - case .any: - return (".any", true) + case .dot: + // The DSL does not have an equivalent to '.', print as a regex. + return ("/./", false) - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: fatalError("unreachable") case .backreference: @@ -950,10 +957,10 @@ extension AST.Atom { case .namedCharacter(let n): return "\\N{\(n)}" - case .any: + case .dot: return "." - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: fatalError("unreachable") case .backreference: @@ -1101,6 +1108,13 @@ extension DSLTree.Atom { switch self { case .any: return (".any", true) + + case .anyNonNewline: + return (".anyNonNewline", true) + + case .dot: + // The DSL does not have an equivalent to '.', print as a regex. + return ("/./", false) case let .char(c): return (String(c)._quoted, false) @@ -1117,7 +1131,7 @@ extension DSLTree.Atom { } case .assertion(let a): - return (a.ast._patternBase, false) + return (a._patternBase, false) case .backreference(_): return ("/* TOOD: backreferences */", false) @@ -1142,6 +1156,12 @@ extension DSLTree.Atom { var _regexBase: String { switch self { case .any: + return "(?s:.)" + + case .anyNonNewline: + return "(?-s:.)" + + case .dot: return "." case let .char(c): diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 320d10897..2146fd61b 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -208,16 +208,44 @@ extension AST.CustomCharacterClass { } } +extension AST.Atom.EscapedBuiltin { + var dslAssertionKind: DSLTree.Atom.Assertion? { + switch self { + case .wordBoundary: return .wordBoundary + case .notWordBoundary: return .notWordBoundary + case .startOfSubject: return .startOfSubject + case .endOfSubject: return .endOfSubject + case .textSegment: return .textSegment + case .notTextSegment: return .notTextSegment + case .endOfSubjectBeforeNewline: return .endOfSubjectBeforeNewline + case .firstMatchingPositionInSubject: return .firstMatchingPositionInSubject + case .resetStartOfMatch: return .resetStartOfMatch + default: return nil + } + } +} + +extension AST.Atom { + var dslAssertionKind: DSLTree.Atom.Assertion? { + switch kind { + case .caretAnchor: return .caretAnchor + case .dollarAnchor: return .dollarAnchor + case .escaped(let b): return b.dslAssertionKind + default: return nil + } + } +} + extension AST.Atom { var dslTreeAtom: DSLTree.Atom { - if let kind = assertionKind { - return .assertion(.init(ast: kind)) + if let kind = dslAssertionKind { + return .assertion(kind) } switch self.kind { case let .char(c): return .char(c) case let .scalar(s): return .char(Character(s.value)) - case .any: return .any + case .dot: return .dot case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 740bdcb8d..449baa6a7 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -117,11 +117,11 @@ extension DSLTree { var members: [Member] var isInverted: Bool - var containsAny: Bool { + var containsDot: Bool { members.contains { member in switch member { - case .atom(.any): return true - case .custom(let ccc): return ccc.containsAny + case .atom(.dot): return true + case .custom(let ccc): return ccc.containsDot default: return false } @@ -245,9 +245,19 @@ extension DSLTree { public enum Atom { case char(Character) case scalar(Unicode.Scalar) + + /// Any character, including newlines. case any - case assertion(_AST.AssertionKind) + /// Any character, excluding newlines. This differs from '.', as it is not + /// affected by single line mode. + case anyNonNewline + + /// The DSL representation of '.' in a regex literal. This does not match + /// newlines unless single line mode is enabled. + case dot + + case assertion(Assertion) case backreference(_AST.Reference) case symbolicReference(ReferenceID) @@ -257,6 +267,52 @@ extension DSLTree { } } +extension DSLTree.Atom { + @_spi(RegexBuilder) + public enum Assertion: Hashable { + /// \A + case startOfSubject + + /// \Z + case endOfSubjectBeforeNewline + + /// \z + case endOfSubject + + /// \K + case resetStartOfMatch + + /// \G + case firstMatchingPositionInSubject + + /// \y + case textSegment + + /// \Y + case notTextSegment + + /// The DSL's Anchor.startOfLine, which matches the start of a line + /// even if `anchorsMatchNewlines` is false. + case startOfLine + + /// The DSL's Anchor.endOfLine, which matches the end of a line + /// even if `anchorsMatchNewlines` is false. + case endOfLine + + /// ^ + case caretAnchor + + /// $ + case dollarAnchor + + /// \b (from outside a custom character class) + case wordBoundary + + /// \B + case notWordBoundary + } +} + extension Unicode.GeneralCategory { var extendedGeneralCategory: Unicode.ExtendedGeneralCategory? { switch self { @@ -773,40 +829,6 @@ extension DSLTree { internal var ast: AST.AbsentFunction } - @_spi(RegexBuilder) - public struct AssertionKind { - internal var ast: AST.Atom.AssertionKind - - public static func startOfSubject(_ inverted: Bool = false) -> Self { - .init(ast: .startOfSubject) - } - public static func endOfSubjectBeforeNewline(_ inverted: Bool = false) -> Self { - .init(ast: .endOfSubjectBeforeNewline) - } - public static func endOfSubject(_ inverted: Bool = false) -> Self { - .init(ast: .endOfSubject) - } - public static func firstMatchingPositionInSubject(_ inverted: Bool = false) -> Self { - .init(ast: .firstMatchingPositionInSubject) - } - public static func textSegmentBoundary(_ inverted: Bool = false) -> Self { - inverted - ? .init(ast: .notTextSegment) - : .init(ast: .textSegment) - } - public static func startOfLine(_ inverted: Bool = false) -> Self { - .init(ast: .startOfLine) - } - public static func endOfLine(_ inverted: Bool = false) -> Self { - .init(ast: .endOfLine) - } - public static func wordBoundary(_ inverted: Bool = false) -> Self { - inverted - ? .init(ast: .notWordBoundary) - : .init(ast: .wordBoundary) - } - } - @_spi(RegexBuilder) public struct Reference { internal var ast: AST.Reference @@ -820,6 +842,31 @@ extension DSLTree { @_spi(RegexBuilder) public struct Atom { internal var ast: AST.Atom + + // FIXME: The below APIs should be removed once the DSL tree has been + // migrated to use proper DSL atoms for them. + + public static var _anyGrapheme: Self { + .init(ast: .init(.escaped(.graphemeCluster), .fake)) + } + public static var _whitespace: Self { + .init(ast: .init(.escaped(.whitespace), .fake)) + } + public static var _digit: Self { + .init(ast: .init(.escaped(.decimalDigit), .fake)) + } + public static var _horizontalWhitespace: Self { + .init(ast: .init(.escaped(.horizontalWhitespace), .fake)) + } + public static var _newlineSequence: Self { + .init(ast: .init(.escaped(.newlineSequence), .fake)) + } + public static var _verticalWhitespace: Self { + .init(ast: .init(.escaped(.verticalTab), .fake)) + } + public static var _word: Self { + .init(ast: .init(.escaped(.wordCharacter), .fake)) + } } } } @@ -832,7 +879,8 @@ extension DSLTree.Atom { switch self { case .changeMatchingOptions, .assertion: return false - case .char, .scalar, .any, .backreference, .symbolicReference, .unconverted: + case .char, .scalar, .any, .anyNonNewline, .dot, .backreference, + .symbolicReference, .unconverted: return true } } diff --git a/Sources/_StringProcessing/Utility/RegexFactory.swift b/Sources/_StringProcessing/Utility/RegexFactory.swift index 693b04966..31245c0f7 100644 --- a/Sources/_StringProcessing/Utility/RegexFactory.swift +++ b/Sources/_StringProcessing/Utility/RegexFactory.swift @@ -40,7 +40,7 @@ public struct _RegexFactory { @_spi(RegexBuilder) @available(SwiftStdlib 5.7, *) public func assertion( - _ kind: DSLTree._AST.AssertionKind + _ kind: DSLTree.Atom.Assertion ) -> Regex { .init(node: .atom(.assertion(kind))) } diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index db2088782..9f515f220 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -15,8 +15,7 @@ // an AST, but this isn't a natural thing to produce in the context // of parsing or to store in an AST -@_spi(RegexBuilder) -public struct _CharacterClassModel: Hashable { +struct _CharacterClassModel: Hashable { /// The actual character class to match. var cc: Representation @@ -28,7 +27,7 @@ public struct _CharacterClassModel: Hashable { var isInverted: Bool = false // TODO: Split out builtin character classes into their own type? - public enum Representation: Hashable { + enum Representation: Hashable { /// Any character case any /// Any grapheme cluster @@ -50,74 +49,6 @@ public struct _CharacterClassModel: Hashable { case whitespace /// Character.isLetter or Character.isDigit or Character == "_" case word - /// One of the custom character set. - case custom([CharacterSetComponent]) - } - - public enum SetOperator: Hashable { - case subtraction - case intersection - case symmetricDifference - } - - /// A binary set operation that forms a character class component. - public struct SetOperation: Hashable { - var lhs: CharacterSetComponent - var op: SetOperator - var rhs: CharacterSetComponent - - func matches(_ c: Character, with options: MatchingOptions) -> Bool { - switch op { - case .intersection: - return lhs.matches(c, with: options) && rhs.matches(c, with: options) - case .subtraction: - return lhs.matches(c, with: options) && !rhs.matches(c, with: options) - case .symmetricDifference: - return lhs.matches(c, with: options) != rhs.matches(c, with: options) - } - } - } - - public enum CharacterSetComponent: Hashable { - case character(Character) - case range(ClosedRange) - - /// A nested character class. - case characterClass(_CharacterClassModel) - - /// A binary set operation of character class components. - indirect case setOperation(SetOperation) - - public static func setOperation( - lhs: CharacterSetComponent, op: SetOperator, rhs: CharacterSetComponent - ) -> CharacterSetComponent { - .setOperation(.init(lhs: lhs, op: op, rhs: rhs)) - } - - func matches(_ character: Character, with options: MatchingOptions) -> Bool { - switch self { - case .character(let c): - if options.isCaseInsensitive { - return c.lowercased() == character.lowercased() - } else { - return c == character - } - case .range(let range): - if options.isCaseInsensitive { - let newLower = range.lowerBound.lowercased() - let newUpper = range.upperBound.lowercased() - // FIXME: Is failing this possible? Is this the right behavior if so? - guard newLower <= newUpper else { return false } - return (newLower...newUpper).contains(character.lowercased()) - } else { - return range.contains(character) - } - case .characterClass(let custom): - let str = String(character) - return custom.matches(in: str, at: str.startIndex, with: options) != nil - case .setOperation(let op): return op.matches(character, with: options) - } - } } enum MatchLevel: Hashable { @@ -153,7 +84,7 @@ public struct _CharacterClassModel: Hashable { } /// Inverts a character class. - public var inverted: Self { + var inverted: Self { return withInversion(true) } @@ -188,8 +119,6 @@ public struct _CharacterClassModel: Hashable { matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = c.isWordCharacter && (c.isASCII || !options.usesASCIIWord) - case .custom(let set): - matched = set.any { $0.matches(c, with: options) } } if isInverted { matched.toggle() @@ -222,8 +151,6 @@ public struct _CharacterClassModel: Hashable { matched = c.properties.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !options.usesASCIIWord) - case .custom(let set): - matched = set.any { $0.matches(Character(c), with: options) } } if isInverted { matched.toggle() @@ -233,80 +160,50 @@ public struct _CharacterClassModel: Hashable { } } -@available(SwiftStdlib 5.7, *) -extension _CharacterClassModel: RegexComponent { - public typealias RegexOutput = Substring - - public var regex: Regex { - guard let ast = self.makeAST() else { - fatalError("FIXME: extended AST?") - } - return Regex(ast: ast) - } -} - -@_spi(RegexBuilder) extension _CharacterClassModel { - public static var any: _CharacterClassModel { + static var any: _CharacterClassModel { .init(cc: .any, matchLevel: .graphemeCluster) } - public static var anyGrapheme: _CharacterClassModel { + static var anyGrapheme: _CharacterClassModel { .init(cc: .anyGrapheme, matchLevel: .graphemeCluster) } - public static var anyUnicodeScalar: _CharacterClassModel { + static var anyUnicodeScalar: _CharacterClassModel { .init(cc: .any, matchLevel: .unicodeScalar) } - public static var whitespace: _CharacterClassModel { + static var whitespace: _CharacterClassModel { .init(cc: .whitespace, matchLevel: .graphemeCluster) } - public static var digit: _CharacterClassModel { + static var digit: _CharacterClassModel { .init(cc: .digit, matchLevel: .graphemeCluster) } - public static var hexDigit: _CharacterClassModel { + static var hexDigit: _CharacterClassModel { .init(cc: .hexDigit, matchLevel: .graphemeCluster) } - public static var horizontalWhitespace: _CharacterClassModel { + static var horizontalWhitespace: _CharacterClassModel { .init(cc: .horizontalWhitespace, matchLevel: .graphemeCluster) } - public static var newlineSequence: _CharacterClassModel { + static var newlineSequence: _CharacterClassModel { .init(cc: .newlineSequence, matchLevel: .graphemeCluster) } - public static var verticalWhitespace: _CharacterClassModel { + static var verticalWhitespace: _CharacterClassModel { .init(cc: .verticalWhitespace, matchLevel: .graphemeCluster) } - public static var word: _CharacterClassModel { + static var word: _CharacterClassModel { .init(cc: .word, matchLevel: .graphemeCluster) } - - public static func custom( - _ components: [_CharacterClassModel.CharacterSetComponent] - ) -> _CharacterClassModel { - .init(cc: .custom(components), matchLevel: .graphemeCluster) - } -} - -extension _CharacterClassModel.CharacterSetComponent: CustomStringConvertible { - public var description: String { - switch self { - case .range(let range): return "" - case .character(let character): return "" - case .characterClass(let custom): return "\(custom)" - case .setOperation(let op): return "<\(op.lhs) \(op.op) \(op.rhs)>" - } - } } extension _CharacterClassModel.Representation: CustomStringConvertible { - public var description: String { + var description: String { switch self { case .any: return "" case .anyGrapheme: return "" @@ -318,95 +215,16 @@ extension _CharacterClassModel.Representation: CustomStringConvertible { case .verticalWhitespace: return "vertical whitespace" case .whitespace: return "" case .word: return "" - case .custom(let set): return "" } } } extension _CharacterClassModel: CustomStringConvertible { - public var description: String { + var description: String { return "\(isInverted ? "not " : "")\(cc)" } } -extension _CharacterClassModel { - public func makeDSLTreeCharacterClass() -> DSLTree.CustomCharacterClass? { - // FIXME: Implement in DSLTree instead of wrapping an AST atom - switch makeAST() { - case .atom(let atom): - return .init(members: [.atom(.unconverted(.init(ast: atom)))]) - default: - return nil - } - } - - internal func makeAST() -> AST.Node? { - let inv = isInverted - - func esc(_ b: AST.Atom.EscapedBuiltin) -> AST.Node { - escaped(b) - } - - switch cc { - case .any: return atom(.any) - - case .digit: - return esc(inv ? .notDecimalDigit : .decimalDigit) - - case .horizontalWhitespace: - return esc( - inv ? .notHorizontalWhitespace : .horizontalWhitespace) - - // FIXME: newline sequence is not same as \n - case .newlineSequence: - return esc(inv ? .notNewline : .newline) - - case .whitespace: - return esc(inv ? .notWhitespace : .whitespace) - - case .verticalWhitespace: - return esc(inv ? .notVerticalTab : .verticalTab) - - case .word: - return esc(inv ? .notWordCharacter : .wordCharacter) - - case .anyGrapheme: - return esc(.graphemeCluster) - - case .hexDigit: - let members: [AST.CustomCharacterClass.Member] = [ - range_m(.char("a"), .char("f")), - range_m(.char("A"), .char("F")), - range_m(.char("0"), .char("9")), - ] - let ccc = AST.CustomCharacterClass( - .init(faking: inv ? .inverted : .normal), - members, - .fake) - - return .customCharacterClass(ccc) - - default: return nil - } - } -} - -extension DSLTree.Node { - var characterClass: _CharacterClassModel? { - switch self { - case let .customCharacterClass(ccc): - return ccc.modelCharacterClass - case let .atom(a): - return a.characterClass - case .characterPredicate: - // FIXME: Do we make one from this? - return nil - default: - return nil - } - } -} - extension _CharacterClassModel { func withMatchLevel( _ level: _CharacterClassModel.MatchLevel @@ -417,17 +235,6 @@ extension _CharacterClassModel { } } -extension DSLTree.Atom { - var characterClass: _CharacterClassModel? { - switch self { - case let .unconverted(a): - return a.ast.characterClass - - default: return nil - } - } -} - extension AST.Atom { var characterClass: _CharacterClassModel? { switch kind { @@ -438,8 +245,8 @@ extension AST.Atom { // this? Or does grapheme-semantic mode complicate that? return nil - case .any: - // `.any` is handled in the matching engine by Compiler.emitAny() and in + case .dot: + // `.dot` is handled in the matching engine by Compiler.emitDot() and in // the legacy compiler by the `.any` instruction, which can provide lower // level instructions than the CharacterClass-generated consumer closure // @@ -468,7 +275,7 @@ extension AST.Atom.EscapedBuiltin { // FIXME: This is more like '.' than inverted '\R', as it is affected // by e.g (*CR). We should therefore really be emitting it through - // emitAny(). For now we treat it as semantically invalid. + // emitDot(). For now we treat it as semantically invalid. case .notNewline: return .newlineSequence.inverted case .whitespace: return .whitespace @@ -489,81 +296,6 @@ extension AST.Atom.EscapedBuiltin { } } -extension DSLTree.CustomCharacterClass { - // TODO: Refactor a bit, and... can we drop this type? - var modelCharacterClass: _CharacterClassModel? { - var result = - Array<_CharacterClassModel.CharacterSetComponent>() - for m in members { - switch m { - case let .atom(a): - if let cc = a.characterClass { - result.append(.characterClass(cc)) - } else if let c = a.literalCharacterValue { - result.append(.character(c)) - } else { - return nil - } - case let .range(low, high): - guard let lhs = low.literalCharacterValue, - let rhs = high.literalCharacterValue - else { - return nil - } - result.append(.range(lhs...rhs)) - - case let .custom(ccc): - guard let cc = ccc.modelCharacterClass else { - return nil - } - result.append(.characterClass(cc)) - - case let .intersection(lhs, rhs): - guard let lhs = lhs.modelCharacterClass, - let rhs = rhs.modelCharacterClass - else { - return nil - } - result.append(.setOperation( - lhs: .characterClass(lhs), - op: .intersection, - rhs: .characterClass(rhs))) - - case let .subtraction(lhs, rhs): - guard let lhs = lhs.modelCharacterClass, - let rhs = rhs.modelCharacterClass - else { - return nil - } - result.append(.setOperation( - lhs: .characterClass(lhs), - op: .subtraction, - rhs: .characterClass(rhs))) - - case let .symmetricDifference(lhs, rhs): - guard let lhs = lhs.modelCharacterClass, - let rhs = rhs.modelCharacterClass - else { - return nil - } - result.append(.setOperation( - lhs: .characterClass(lhs), - op: .symmetricDifference, - rhs: .characterClass(rhs))) - - case let .quotedLiteral(s): - // Decompose quoted literal into literal characters. - result += s.map { .character($0) } - - case .trivia: - break - } - } - let cc = _CharacterClassModel.custom(result) - return isInverted ? cc.inverted : cc - } -} - extension _CharacterClassModel { // FIXME: Calling on inverted sets wont be the same as the // inverse of a boundary if at the start or end of the diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index b67c6c242..1cf039b35 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -69,6 +69,9 @@ class RegexDSLTests: XCTestCase { XCTAssertTrue(match.output == substringMatch.output) } + let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" + let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" + func testCharacterClasses() throws { try _testDSLCaptures( ("a c", ("a c", " ", "c")), @@ -110,6 +113,137 @@ class RegexDSLTests: XCTestCase { CharacterClass.whitespace.inverted } } + + // `.newlineSequence` and `.verticalWhitespace` match the same set of + // newlines in grapheme semantic mode, and scalar mode when applied with + // OneOrMore. + for cc in [CharacterClass.newlineSequence, .verticalWhitespace] { + for mode in [RegexSemanticLevel.unicodeScalar, .graphemeCluster] { + try _testDSLCaptures( + ("\n", ("\n", "\n")), + ("\r", ("\r", "\r")), + ("\r\n", ("\r\n", "\r\n")), + (allNewlines, (allNewlines[...], allNewlines[...])), + ("abc\ndef", ("abc\ndef", "\n")), + ("abc\n\r\ndef", ("abc\n\r\ndef", "\n\r\n")), + ("abc\(allNewlines)def", ("abc\(allNewlines)def", allNewlines[...])), + ("abc", nil), + matchType: (Substring, Substring).self, ==) + { + Regex { + ZeroOrMore { + cc.inverted + } + Capture { + OneOrMore(cc) + } + ZeroOrMore { + cc.inverted + } + }.matchingSemantics(mode) + } + + // Try with ASCII-only whitespace. + try _testDSLCaptures( + ("\n", ("\n", "\n")), + ("\r", ("\r", "\r")), + ("\r\n", ("\r\n", "\r\n")), + (allNewlines, (allNewlines[...], asciiNewlines[...])), + ("abc\ndef", ("abc\ndef", "\n")), + ("abc\n\r\ndef", ("abc\n\r\ndef", "\n\r\n")), + ("abc\(allNewlines)def", ("abc\(allNewlines)def", asciiNewlines[...])), + ("abc", nil), + matchType: (Substring, Substring).self, ==) + { + Regex { + ZeroOrMore { + cc.inverted + } + Capture { + OneOrMore(cc) + } + ZeroOrMore { + cc.inverted + } + }.matchingSemantics(mode).asciiOnlyWhitespace() + } + } + } + + // `.newlineSequence` in scalar mode may match a single `\r\n`. + // `.verticalWhitespace` may not. + for asciiOnly in [true, false] { + try _testDSLCaptures( + ("\r", "\r"), + ("\r\n", "\r\n"), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.newlineSequence + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", nil), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.newlineSequence.inverted + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", "\r"), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.verticalWhitespace + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", nil), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.verticalWhitespace.inverted + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", nil), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.verticalWhitespace.inverted + "\n" + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + } + + // Make sure horizontal whitespace does not match newlines or other + // vertical whitespace. + try _testDSLCaptures( + (" \u{A0} \u{9} \t ", " \u{A0} \u{9} \t "), + (" \n", nil), + (" \r", nil), + (" \r\n", nil), + (" \u{2028}", nil), + matchType: Substring.self, ==) + { + OneOrMore(.horizontalWhitespace) + } + + // Horizontal whitespace in ASCII mode. + try _testDSLCaptures( + (" \u{9} \t ", " \u{9} \t "), + ("\u{A0}", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.horizontalWhitespace) + }.asciiOnlyWhitespace() + } } func testCharacterClassOperations() throws { @@ -133,6 +267,105 @@ class RegexDSLTests: XCTestCase { } } + func testAny() throws { + // .any matches newlines regardless of matching options. + for dotMatchesNewline in [true, false] { + try _testDSLCaptures( + ("abc\(allNewlines)def", "abc\(allNewlines)def"), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.any) + }.dotMatchesNewlines(dotMatchesNewline) + } + } + + // `.anyGraphemeCluster` is the same as `.any` in grapheme mode. + for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + try _testDSLCaptures( + ("a", "a"), + ("\r\n", "\r\n"), + ("e\u{301}", "e\u{301}"), + ("e\u{301}f", nil), + ("e\u{303}\u{301}\u{302}", "e\u{303}\u{301}\u{302}"), + matchType: Substring.self, ==) + { + Regex { + One(.anyGraphemeCluster) + }.matchingSemantics(mode) + } + + // Like `.any` it also always matches newlines. + for dotMatchesNewline in [true, false] { + try _testDSLCaptures( + ("abc\(allNewlines)def", "abc\(allNewlines)def"), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.anyGraphemeCluster) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + } + } + } + + func testAnyNonNewline() throws { + // `.anyNonNewline` is `.` without single-line mode. + for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + for dotMatchesNewline in [true, false] { + try _testDSLCaptures( + ("abcdef", "abcdef"), + ("abcdef\n", nil), + ("\r\n", nil), + ("\r", nil), + ("\n", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.anyNonNewline) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + + try _testDSLCaptures( + ("abcdef", nil), + ("abcdef\n", nil), + ("\r\n", "\r\n"), + ("\r", "\r"), + ("\n", "\n"), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.anyNonNewline.inverted) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + + try _testDSLCaptures( + ("abc", "abc"), + ("abcd", nil), + ("\r\n", nil), + ("\r", nil), + ("\n", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(CharacterClass.anyNonNewline.intersection(.anyOf("\n\rabc"))) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + } + } + + try _testDSLCaptures( + ("\r\n", "\r\n"), matchType: Substring.self, ==) { + CharacterClass.anyNonNewline.inverted + } + try _testDSLCaptures( + ("\r\n", nil), matchType: Substring.self, ==) { + Regex { + CharacterClass.anyNonNewline.inverted + }.matchingSemantics(.unicodeScalar) + } + } + func testMatchResultDotZeroWithoutCapture() throws { let match = try XCTUnwrap("aaa".wholeMatch { OneOrMore { "a" } }) XCTAssertEqual(match.0, "aaa") @@ -674,19 +907,40 @@ class RegexDSLTests: XCTestCase { Anchor.endOfSubject }.anchorsMatchLineEndings() } - - // FIXME: Anchor.start/endOfLine needs to always match line endings, - // even when the `anchorsMatchLineEndings()` option is turned off. + try _testDSLCaptures( - ("\naaa", "aaa"), - ("aaa\n", "aaa"), - ("\naaa\n", "aaa"), - matchType: Substring.self, ==, xfail: true) + ("\naaa", "\naaa"), + ("aaa\n", "aaa\n"), + ("\naaa\n", "\naaa\n"), + matchType: Substring.self, ==) { Regex { + Optionally { "\n" } Anchor.startOfLine Repeat("a", count: 3) Anchor.endOfLine + Optionally { "\n" } + } + } + + // startOfLine/endOfLine apply regardless of mode. + for matchLineEndings in [true, false] { + for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + let r = Regex { + Anchor.startOfLine + Repeat("a", count: 3) + Anchor.endOfLine + }.anchorsMatchLineEndings(matchLineEndings).matchingSemantics(mode) + + XCTAssertNotNil(try r.firstMatch(in: "\naaa")) + XCTAssertNotNil(try r.firstMatch(in: "aaa\n")) + XCTAssertNotNil(try r.firstMatch(in: "\naaa\n")) + XCTAssertNotNil(try r.firstMatch(in: "\naaa\r\n")) + XCTAssertNotNil(try r.firstMatch(in: "\r\naaa\n")) + XCTAssertNotNil(try r.firstMatch(in: "\r\naaa\r\n")) + + XCTAssertNil(try r.firstMatch(in: "\nbaaa\n")) + XCTAssertNil(try r.firstMatch(in: "\naaab\n")) } } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index d375065ab..8f7baf4b9 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -24,9 +24,10 @@ func _firstMatch( _ regexStr: String, input: String, validateOptimizations: Bool, + semanticLevel: RegexSemanticLevel = .graphemeCluster, syntax: SyntaxOptions = .traditional ) throws -> (String, [String?]) { - var regex = try Regex(regexStr, syntax: syntax) + var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel) guard let result = try regex.firstMatch(in: input) else { throw MatchError("match not found for \(regexStr) in \(input)") } @@ -54,6 +55,7 @@ func flatCaptureTest( dumpAST: Bool = false, xfail: Bool = false, validateOptimizations: Bool = true, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #file, line: UInt = #line ) { @@ -63,6 +65,7 @@ func flatCaptureTest( regex, input: test, validateOptimizations: validateOptimizations, + semanticLevel: semanticLevel, syntax: syntax ) else { if expect == nil { @@ -113,6 +116,7 @@ func matchTest( dumpAST: Bool = false, xfail: Bool = false, validateOptimizations: Bool = true, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #file, line: UInt = #line ) { @@ -126,6 +130,7 @@ func matchTest( dumpAST: dumpAST, xfail: xfail, validateOptimizations: validateOptimizations, + semanticLevel: semanticLevel, file: file, line: line) } @@ -143,6 +148,7 @@ func firstMatchTest( dumpAST: Bool = false, xfail: Bool = false, validateOptimizations: Bool = true, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #filePath, line: UInt = #line ) { @@ -151,6 +157,7 @@ func firstMatchTest( regex, input: input, validateOptimizations: validateOptimizations, + semanticLevel: semanticLevel, syntax: syntax) if xfail { @@ -627,6 +634,49 @@ extension RegexTests { ("\n", true), ("\r", true)) + let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" + let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" + + for level in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + firstMatchTest( + #"\R+"#, + input: "abc\(allNewlines)def", match: allNewlines, + semanticLevel: level + ) + firstMatchTest( + #"\v+"#, + input: "abc\(allNewlines)def", match: allNewlines, + semanticLevel: level + ) + } + + // In scalar mode, \R can match \r\n, \v cannot. + firstMatchTest( + #"\R"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar) + firstMatchTest( + #"\v"#, input: "\r\n", match: "\r", semanticLevel: .unicodeScalar) + firstMatchTest( + #"\v\v"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar) + firstMatchTest( + #"[^\v]"#, input: "\r\n", match: nil, semanticLevel: .unicodeScalar) + + // ASCII-only spaces. + firstMatchTest(#"(?S)\R+"#, input: allNewlines, match: asciiNewlines) + firstMatchTest(#"(?S)\v+"#, input: allNewlines, match: asciiNewlines) + firstMatchTest( + #"(?S)\R"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar) + firstMatchTest( + #"(?S)\v"#, input: "\r\n", match: "\r", semanticLevel: .unicodeScalar) + + matchTest( + #"[a]\u0301"#, + ("a\u{301}", false), + semanticLevel: .graphemeCluster) + matchTest( + #"[a]\u0301"#, + ("a\u{301}", true), + semanticLevel: .unicodeScalar) + firstMatchTest("[-]", input: "123-abcxyz", match: "-") // These are metacharacters in certain contexts, but normal characters diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 3c43f27af..52a272915 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -359,14 +359,14 @@ extension RegexTests { parseTest( "(.)*(.*)", concat( - zeroOrMore(of: capture(atom(.any))), - capture(zeroOrMore(of: atom(.any)))), + zeroOrMore(of: capture(atom(.dot))), + capture(zeroOrMore(of: atom(.dot)))), captures: [.opt, .cap]) parseTest( "((.))*((.)?)", concat( - zeroOrMore(of: capture(capture(atom(.any)))), - capture(zeroOrOne(of: capture(atom(.any))))), + zeroOrMore(of: capture(capture(atom(.dot)))), + capture(zeroOrOne(of: capture(atom(.dot))))), captures: [.opt, .opt, .cap, .opt]) parseTest( #"abc\d"#, @@ -479,7 +479,7 @@ extension RegexTests { parseTest(#"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit))) - // FIXME: '\N' should be emitted through 'emitAny', not through the + // FIXME: '\N' should be emitted through 'emitDot', not through the // _CharacterClassModel model. parseTest(#"\N"#, escaped(.notNewline), unsupported: true) diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 97ba3e333..e33b10c31 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -68,7 +68,38 @@ extension RenderDSLTests { } """) } - + + func testDot() throws { + try testConversion(#".+"#, #""" + Regex { + OneOrMore { + /./ + } + } + """#) + try testConversion(#"a.c"#, #""" + Regex { + "a" + /./ + "c" + } + """#) + } + + func testAnchor() throws { + try testConversion(#"^(?:a|b|c)$"#, #""" + Regex { + /^/ + ChoiceOf { + "a" + "b" + "c" + } + /$/ + } + """#) + } + func testOptions() throws { try XCTExpectFailure("Options like '(?i)' aren't converted") { try testConversion(#"(?i)abc"#, """