diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d03a1e382..d6d6c3c5e 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -111,30 +111,41 @@ extension Compiler.ByteCodeGen { } case .startOfLine: - builder.buildAssert { (input, pos, bounds) in - pos == input.startIndex || - input[input.index(before: pos)].isNewline + if options.anchorsMatchNewlines { + builder.buildAssert { (input, pos, bounds) in + pos == input.startIndex || input[input.index(before: pos)].isNewline + } + } else { + builder.buildAssert { (input, pos, bounds) in + pos == input.startIndex + } } - + case .endOfLine: - builder.buildAssert { (input, pos, bounds) in - pos == input.endIndex || input[pos].isNewline + if options.anchorsMatchNewlines { + builder.buildAssert { (input, pos, bounds) in + pos == input.endIndex || input[pos].isNewline + } + } else { + builder.buildAssert { (input, pos, bounds) in + pos == input.endIndex + } } case .wordBoundary: // TODO: May want to consider Unicode level - builder.buildAssert { (input, pos, bounds) in + builder.buildAssert { [options] (input, pos, bounds) in // TODO: How should we handle bounds? CharacterClass.word.isBoundary( - input, at: pos, bounds: bounds) + input, at: pos, bounds: bounds, with: options) } case .notWordBoundary: // TODO: May want to consider Unicode level - builder.buildAssert { (input, pos, bounds) in + builder.buildAssert { [options] (input, pos, bounds) in // TODO: How should we handle bounds? 
!CharacterClass.word.isBoundary( - input, at: pos, bounds: bounds) + input, at: pos, bounds: bounds, with: options) } } } diff --git a/Sources/_StringProcessing/CharacterClass.swift b/Sources/_StringProcessing/CharacterClass.swift index d44fa9fb2..bdf34d0a7 100644 --- a/Sources/_StringProcessing/CharacterClass.swift +++ b/Sources/_StringProcessing/CharacterClass.swift @@ -59,14 +59,14 @@ public struct CharacterClass: Hashable { var op: SetOperator var rhs: CharacterSetComponent - public func matches(_ c: Character) -> Bool { + public func matches(_ c: Character, with options: MatchingOptions) -> Bool { switch op { case .intersection: - return lhs.matches(c) && rhs.matches(c) + return lhs.matches(c, with: options) && rhs.matches(c, with: options) case .subtraction: - return lhs.matches(c) && !rhs.matches(c) + return lhs.matches(c, with: options) && !rhs.matches(c, with: options) case .symmetricDifference: - return lhs.matches(c) != rhs.matches(c) + return lhs.matches(c, with: options) != rhs.matches(c, with: options) } } } @@ -87,14 +87,28 @@ public struct CharacterClass: Hashable { .setOperation(.init(lhs: lhs, op: op, rhs: rhs)) } - public func matches(_ character: Character) -> Bool { + public func matches(_ character: Character, with options: MatchingOptions) -> Bool { switch self { - case .character(let c): return c == character - case .range(let range): return range.contains(character) + case .character(let c): + if options.isCaseInsensitive { + return c.lowercased() == character.lowercased() + } else { + return c == character + } + case .range(let range): + if options.isCaseInsensitive { + let newLower = range.lowerBound.lowercased() + let newUpper = range.upperBound.lowercased() + // FIXME: Is failing this possible? Is this the right behavior if so? 
+ guard newLower <= newUpper else { return false } + return (newLower...newUpper).contains(character.lowercased()) + } else { + return range.contains(character) + } case .characterClass(let custom): let str = String(character) - return custom.matches(in: str, at: str.startIndex) != nil - case .setOperation(let op): return op.matches(character) + return custom.matches(in: str, at: str.startIndex, with: options) != nil + case .setOperation(let op): return op.matches(character, with: options) } } } @@ -135,21 +149,26 @@ public struct CharacterClass: Hashable { /// Returns the end of the match of this character class in `str`, if /// it matches. - public func matches(in str: String, at i: String.Index) -> String.Index? { + public func matches(in str: String, at i: String.Index, with options: MatchingOptions) -> String.Index? { switch matchLevel { case .graphemeCluster: let c = str[i] var matched: Bool switch cc { case .any, .anyGrapheme: matched = true - case .digit: matched = c.isNumber - case .hexDigit: matched = c.isHexDigit + case .digit: + matched = c.isNumber && (c.isASCII || !options.usesASCIIDigits) + case .hexDigit: + matched = c.isHexDigit && (c.isASCII || !options.usesASCIIDigits) case .horizontalWhitespace: fatalError("Not implemented") - case .newlineSequence: matched = c.isNewline + case .newlineSequence: + matched = c.isNewline && (c.isASCII || !options.usesASCIISpaces) case .verticalWhitespace: fatalError("Not implemented") - case .whitespace: matched = c.isWhitespace - case .word: matched = c.isWordCharacter - case .custom(let set): matched = set.any { $0.matches(c) } + case .whitespace: + matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces) + case .word: + matched = c.isWordCharacter && (c.isASCII || !options.usesASCIIWord) + case .custom(let set): matched = set.any { $0.matches(c, with: options) } } if isInverted { matched.toggle() @@ -161,13 +180,17 @@ public struct CharacterClass: Hashable { switch cc { case .any: matched = true case 
.anyGrapheme: fatalError("Not matched in this mode") - case .digit: matched = c.properties.numericType != nil - case .hexDigit: matched = Character(c).isHexDigit + case .digit: + matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits) + case .hexDigit: + matched = Character(c).isHexDigit && (c.isASCII || !options.usesASCIIDigits) case .horizontalWhitespace: fatalError("Not implemented") case .newlineSequence: fatalError("Not implemented") case .verticalWhitespace: fatalError("Not implemented") - case .whitespace: matched = c.properties.isWhitespace - case .word: matched = c.properties.isAlphabetic || c == "_" + case .whitespace: + matched = c.properties.isWhitespace && (c.isASCII || !options.usesASCIISpaces) + case .word: + matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !options.usesASCIIWord) case .custom: fatalError("Not supported") } if isInverted { @@ -495,21 +518,22 @@ extension CharacterClass { func isBoundary( _ input: String, at pos: String.Index, - bounds: Range + bounds: Range, + with options: MatchingOptions ) -> Bool { // FIXME: How should we handle bounds? 
// We probably need two concepts if input.isEmpty { return false } if pos == input.startIndex { - return self.matches(in: input, at: pos) != nil + return self.matches(in: input, at: pos, with: options) != nil } let priorIdx = input.index(before: pos) if pos == input.endIndex { - return self.matches(in: input, at: priorIdx) != nil + return self.matches(in: input, at: priorIdx, with: options) != nil } - let prior = self.matches(in: input, at: priorIdx) != nil - let current = self.matches(in: input, at: pos) != nil + let prior = self.matches(in: input, at: priorIdx, with: options) != nil + let current = self.matches(in: input, at: pos, with: options) != nil return prior != current } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 0a2d93ff1..987cbea96 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -136,7 +136,7 @@ extension AST.Atom { ) { return { input, bounds in // FIXME: should we worry about out of bounds? - cc.matches(in: input, at: bounds.lowerBound) + cc.matches(in: input, at: bounds.lowerBound, with: opts) } } diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index 899891184..c4b2b8de7 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -13,7 +13,7 @@ import _RegexParser /// A type that represents the current state of regex matching options, with /// stack-based scoping. 
-struct MatchingOptions { +public struct MatchingOptions { fileprivate var stack: [Representation] fileprivate func _invariantCheck() { @@ -67,11 +67,32 @@ extension MatchingOptions { stack.last!.contains(.singleLine) } + var anchorsMatchNewlines: Bool { + stack.last!.contains(.multiline) + } + + var usesASCIIWord: Bool { + stack.last!.contains(.asciiOnlyWord) + || stack.last!.contains(.asciiOnlyPOSIXProps) + } + + var usesASCIIDigits: Bool { + stack.last!.contains(.asciiOnlyDigit) + || stack.last!.contains(.asciiOnlyPOSIXProps) + } + + var usesASCIISpaces: Bool { + stack.last!.contains(.asciiOnlySpace) + || stack.last!.contains(.asciiOnlyPOSIXProps) + } + + var usesSimpleUnicodeBoundaries: Bool { + !stack.last!.contains(.unicodeWordBoundaries) + } + enum SemanticLevel { case graphemeCluster case unicodeScalar - // TODO: include? - // case byte } var semanticLevel: SemanticLevel { diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift index 04be79c6e..38fba02d6 100644 --- a/Sources/_StringProcessing/Regex/Options.swift +++ b/Sources/_StringProcessing/Regex/Options.swift @@ -12,17 +12,163 @@ import _RegexParser extension RegexComponent { - public func caseSensitive(_ isCaseSensitive: Bool) -> Regex { - // The API is "case sensitive = true or false", so as to avoid the - // double negatives inherent in setting "case insensitive" to a Boolean - // value. The internal version of this option, on the other hand, is - // `.caseInsensitive`, derived from the `(?i)` regex literal option. - let sequence = isCaseSensitive - ? AST.MatchingOptionSequence(removing: [.init(.caseInsensitive, location: .fake)]) - : AST.MatchingOptionSequence(adding: [.init(.caseInsensitive, location: .fake)]) + /// Returns a regular expression that ignores casing when matching. 
+  public func ignoringCase(_ ignoreCase: Bool = true) -> Regex {
+    wrapInOption(.caseInsensitive, addingIf: ignoreCase)
+  }
+
+  /// Returns a regular expression that only matches ASCII characters as "word
+  /// characters".
+  public func usingASCIIWordCharacters(_ useASCII: Bool = true) -> Regex {
+    wrapInOption(.asciiOnlyWord, addingIf: useASCII)
+  }
+
+  /// Returns a regular expression that only matches ASCII characters as digits.
+  public func usingASCIIDigits(_ useASCII: Bool = true) -> Regex {
+    wrapInOption(.asciiOnlyDigit, addingIf: useASCII)
+  }
+
+  /// Returns a regular expression that only matches ASCII characters as space
+  /// characters.
+  public func usingASCIISpaces(_ useASCII: Bool = true) -> Regex {
+    wrapInOption(.asciiOnlySpace, addingIf: useASCII)
+  }
+
+  /// Returns a regular expression that only matches ASCII characters when
+  /// matching character classes.
+  public func usingASCIICharacterClasses(_ useASCII: Bool = true) -> Regex {
+    wrapInOption(.asciiOnlyPOSIXProps, addingIf: useASCII)
+  }
+
+  /// Returns a regular expression that uses the Unicode word boundary
+  /// algorithm.
+  ///
+  /// This option is enabled by default; pass `false` to disable use of
+  /// Unicode's word boundary algorithm.
+  public func usingUnicodeWordBoundaries(_ useUnicodeWordBoundaries: Bool = true) -> Regex {
+    wrapInOption(.unicodeWordBoundaries, addingIf: useUnicodeWordBoundaries)
+  }
+
+  /// Returns a regular expression where the "any" metacharacter (`.`)
+  /// also matches against the newline character.
+  ///
+  /// - Parameter dotMatchesNewlines: A Boolean value indicating whether `.`
+  ///   should match a newline character.
+  public func dotMatchesNewlines(_ dotMatchesNewlines: Bool = true) -> Regex {
+    wrapInOption(.singleLine, addingIf: dotMatchesNewlines)
+  }
+
+  /// Returns a regular expression that matches with the specified semantic
+  /// level.
+  ///
+  /// When matching with grapheme cluster semantics (the default),
+  /// metacharacters like `.` and `\w`, custom character classes, and character
+  /// class instances like `.any` match a grapheme cluster when possible,
+  /// corresponding with the default string representation. In addition,
+  /// matching with grapheme cluster semantics compares characters using their
+  /// canonical representation, corresponding with how string comparison works.
+  ///
+  /// When matching with Unicode scalar semantics, metacharacters and character
+  /// classes always match a single Unicode scalar value, even if that scalar
+  /// is part of a grapheme cluster.
+  ///
+  /// These semantic levels can lead to different results, especially when
+  /// working with strings that have decomposed characters. In the following
+  /// example, `queRegex` matches any 3-character string that begins with `"q"`.
+  ///
+  ///     let composed = "qué"
+  ///     let decomposed = "que\u{301}"
+  ///
+  ///     let queRegex = /^q..$/
+  ///
+  ///     print(composed.contains(queRegex))
+  ///     // Prints "true"
+  ///     print(decomposed.contains(queRegex))
+  ///     // Prints "true"
+  ///
+  /// When using Unicode scalar semantics, however, the regular expression only
+  /// matches the composed version of the string, because each `.` matches a
+  /// single Unicode scalar value.
+  ///
+  ///     let queRegexScalar = queRegex.matchingSemantics(.unicodeScalar)
+  ///     print(composed.contains(queRegexScalar))
+  ///     // Prints "true"
+  ///     print(decomposed.contains(queRegexScalar))
+  ///     // Prints "false"
+  public func matchingSemantics(_ semanticLevel: RegexSemanticLevel) -> Regex {
+    switch semanticLevel.base {
+    case .graphemeCluster:
+      return wrapInOption(.graphemeClusterSemantics, addingIf: true)
+    case .unicodeScalar:
+      return wrapInOption(.unicodeScalarSemantics, addingIf: true)
+    }
+  }
+}
+
+public struct RegexSemanticLevel: Hashable {
+  internal enum Representation {
+    case graphemeCluster
+    case unicodeScalar
+  }
+
+  internal var base: Representation
+
+  /// Match at the default semantic level of a string, where each matched
+  /// element is a `Character`.
+  public static var graphemeCluster: RegexSemanticLevel {
+    .init(base: .graphemeCluster)
+  }
+
+  /// Match at the semantic level of a string's `UnicodeScalarView`, where each
+  /// matched element is a `UnicodeScalar` value.
+  public static var unicodeScalar: RegexSemanticLevel {
+    .init(base: .unicodeScalar)
+  }
+}
+
+// Options that only affect literals
+extension RegexComponent {
+  /// Returns a regular expression where the start and end of input
+  /// anchors (`^` and `$`) also match against the start and end of a line.
+  ///
+  /// This method corresponds to applying the `m` option in a regular
+  /// expression literal, and only applies to regular expressions specified as
+  /// literals. For this behavior in the `RegexBuilder` syntax, see
+  /// ``Anchor/startOfLine``, ``Anchor/endOfLine``, ``Anchor/startOfInput``,
+  /// and ``Anchor/endOfInput``.
+  ///
+  /// - Parameter matchLineEndings: A Boolean value indicating whether `^` and
+  ///   `$` should match the start and end of lines, respectively.
+ public func anchorsMatchLineEndings(_ matchLineEndings: Bool = true) -> Regex { + wrapInOption(.multiline, addingIf: matchLineEndings) + } + + /// Returns a regular expression where quantifiers are reluctant by default + /// instead of eager. + /// + /// This method corresponds to applying the `U` option in a regular + /// expression literal, and only applies to regular expressions specified as + /// literals. In the `RegexBuilder` syntax, pass a ``QuantificationBehavior`` + /// value to any quantification method to change its behavior. + /// + /// - Parameter useReluctantCaptures: A Boolean value indicating whether + /// quantifiers should be reluctant by default. + public func reluctantCaptures(_ useReluctantCaptures: Bool = true) -> Regex { + wrapInOption(.reluctantByDefault, addingIf: useReluctantCaptures) + } +} + +// MARK: - Helper method +extension RegexComponent { + fileprivate func wrapInOption( + _ option: AST.MatchingOption.Kind, + addingIf shouldAdd: Bool) -> Regex + { + let sequence = shouldAdd + ? 
AST.MatchingOptionSequence(adding: [.init(option, location: .fake)]) + : AST.MatchingOptionSequence(removing: [.init(option, location: .fake)]) return Regex(node: .nonCapturingGroup( .changeMatchingOptions(sequence, isIsolated: false), regex.root)) } } - diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 50358734d..a414e7938 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -175,7 +175,7 @@ class RegexDSLTests: XCTestCase { matchType: Substring.self, ==) { OneOrMore { "abc" - }.caseSensitive(false) + }.ignoringCase(true) } // Multiple options on one component wrap successively, but do not @@ -189,8 +189,8 @@ class RegexDSLTests: XCTestCase { OneOrMore { "abc" } - .caseSensitive(false) - .caseSensitive(true) + .ignoringCase(true) + .ignoringCase(false) } // An option on an outer component doesn't override an option set on an @@ -204,10 +204,10 @@ class RegexDSLTests: XCTestCase { ("abcdeABCdeaBcde", "abcdeABCdeaBcde"), matchType: Substring.self, ==) { OneOrMore { - "abc".caseSensitive(false) + "abc".ignoringCase(true) Optionally("de") } - .caseSensitive(true) + .ignoringCase(false) } } @@ -216,32 +216,44 @@ class RegexDSLTests: XCTestCase { ("abc1def2", ("abc1def2", "2")), matchType: (Substring, Substring).self, ==) { - OneOrMore { - OneOrMore(.word) - Capture(.digit) - } + OneOrMore(.word) + Capture(.digit) + ZeroOrMore(.any) } try _testDSLCaptures( - ("abc1def2", ("abc1def2", "2")), + ("abc1def2", ("abc1def2", "1")), matchType: (Substring, Substring).self, ==) { - OneOrMore { - OneOrMore(.word, .reluctantly) - Capture(.digit) - } + OneOrMore(.word, .reluctantly) + Capture(.digit) + ZeroOrMore(.any) } - + +#if os(macOS) + try XCTExpectFailure("'relucantCaptures()' API should only affect regex literals") { + try _testDSLCaptures( + ("abc1def2", ("abc1def2", "2")), + matchType: (Substring, Substring).self, ==) + { + Regex { + OneOrMore(.word) + 
Capture(.digit) + ZeroOrMore(.any) + }.reluctantCaptures() + } + } +#endif + try _testDSLCaptures( - ("abc1def2", ("abc1def2", "2")), + ("abc1def2", ("abc1def2", "1")), matchType: (Substring, Substring).self, ==) { - OneOrMore { - OneOrMore(.reluctantly) { - .word - } - Capture(.digit) + OneOrMore(.reluctantly) { + .word } + Capture(.digit) + ZeroOrMore(.any) } try _testDSLCaptures( diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 67412d262..494af4c7c 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -869,15 +869,15 @@ extension RegexTests { ("123", "123"), (" 123", nil), ("123 456", "123"), - (" 123 \n456", "456"), - (" \n123 \n456", "123")) + (" 123 \n456", nil), + (" \n123 \n456", nil)) firstMatchTests( #"\d+$"#, ("123", "123"), (" 123", "123"), (" 123 \n456", "456"), - (" 123\n456", "123"), + (" 123\n456", "456"), ("123 456", "456")) firstMatchTests( @@ -1197,6 +1197,89 @@ extension RegexTests { ("cafe", true), ("CaFe", true), ("EfAc", true)) + matchTest( + #"(?i)[a-f]{4}"#, + ("cafe", true), + ("CaFe", true), + ("EfAc", true)) + } + + func testASCIIClasses() { + // 'D' ASCII-only digits + matchTest( + #"\d+"#, + ("123", true), + ("¹೨¾", true)) + matchTest( + #"(?D)\d+"#, + ("123", true), + ("¹೨¾", false)) + matchTest( + #"(?P)\d+"#, + ("123", true), + ("¹೨¾", false)) + + // 'W' ASCII-only word characters (and word boundaries) + matchTest( + #"\w+"#, + ("aeiou", true), + ("åe\u{301}ïôú", true)) + matchTest( + #"(?W)\w+"#, + ("aeiou", true), + ("åe\u{301}ïôú", false)) + matchTest( + #"(?P)\w+"#, + ("aeiou", true), + ("åe\u{301}ïôú", false)) + + matchTest( + #"abcd\b.+"#, + ("abcd ef", true), + ("abcdef", false), + ("abcdéf", false)) + matchTest( + #"(?W)abcd\b.+"#, + ("abcd ef", true), + ("abcdef", false), + ("abcdéf", true)) // "dé" matches /d\b./ because "é" isn't ASCII + matchTest( + #"(?P)abcd\b.+"#, + ("abcd ef", true), + ("abcdef", false), + ("abcdéf", true)) // "dé" matches 
/d\b./ because "é" isn't ASCII + + // 'S' ASCII-only spaces + matchTest( + #"a\sb"#, + ("a\tb", true), + ("a\u{202f}b", true)) // NARROW NO-BREAK SPACE + matchTest( + #"(?S)a\sb"#, + ("a\tb", true), + ("a\u{202f}b", false)) + matchTest( + #"(?P)a\sb"#, + ("a\tb", true), + ("a\u{202f}b", false)) + } + + func testAnchorMatching() throws { + let string = """ + 01: Alabama + 02: Alaska + 03: Arizona + 04: Arkansas + 05: California + """ + XCTAssertTrue(string.contains(try Regex(compiling: #"^\d+"#))) + XCTAssertEqual(string.ranges(of: try Regex(compiling: #"^\d+"#)).count, 1) + XCTAssertEqual(string.ranges(of: try Regex(compiling: #"(?m)^\d+"#)).count, 5) + + let regex = try Regex(compiling: #"^\d+: [\w ]+$"#) + XCTAssertFalse(string.contains(regex)) + let allRanges = string.ranges(of: regex.anchorsMatchLineEndings()) + XCTAssertEqual(allRanges.count, 5) } func testMatchingOptionsScope() { @@ -1220,6 +1303,16 @@ extension RegexTests { firstMatchTest(#"(?s)((?-s)((?i)a)).b"#, input: "a\nb", match: "a\nb") } + func testOptionMethods() throws { + let regex = try Regex(compiling: "c.f.") + XCTAssertTrue ("cafe".contains(regex)) + XCTAssertFalse("CaFe".contains(regex)) + + let caseInsensitiveRegex = regex.ignoringCase() + XCTAssertTrue("cafe".contains(caseInsensitiveRegex)) + XCTAssertTrue("CaFe".contains(caseInsensitiveRegex)) + } + // MARK: Character Semantics var eComposed: String { "é" }