From cee8683cdb2daf3fe51144666e4c838d2a3d58f5 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 17 Feb 2022 09:33:38 -0600 Subject: [PATCH 1/3] Add support for case insensitive matching --- Sources/_StringProcessing/ByteCodeGen.swift | 59 ++++++++++++++++--- .../_StringProcessing/MatchingOptions.swift | 9 ++- Tests/RegexTests/MatchTests.swift | 25 ++++++++ 3 files changed, 83 insertions(+), 10 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index e09616363..f9d6a4b18 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -20,15 +20,11 @@ extension Compiler.ByteCodeGen { emitAny() case let .char(c): - // FIXME: Does semantic level matter? - builder.buildMatch(c) - + try emitCharacter(c) + case let .scalar(s): - // TODO: Native instruction - builder.buildConsume(by: consumeScalar { - $0 == s - }) - + try emitScalar(s) + case let .assertion(kind): try emitAssertion(kind) @@ -135,6 +131,36 @@ extension Compiler.ByteCodeGen { } } } + + mutating func emitScalar(_ s: UnicodeScalar) throws { + // TODO: Native instruction buildMatchScalar(s) + if options.isCaseSensitive { + builder.buildConsume(by: consumeScalar { + $0 == s + }) + } else { + // TODO: e.g. buildCaseInsensitiveMatchScalar(s) + builder.buildConsume(by: consumeScalar { + $0.properties.lowercaseMapping == s.properties.lowercaseMapping + }) + } + } + + mutating func emitCharacter(_ c: Character) throws { + // FIXME: Does semantic level matter? + if options.isCaseSensitive || !c.isCased { + builder.buildMatch(c) + } else { + // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true) + builder.buildConsume { input, bounds in + let inputChar = input[bounds.lowerBound].lowercased() + let matchChar = c.lowercased() + return inputChar == matchChar + ? 
input.index(after: bounds.lowerBound) + : nil + } + } + } mutating func emitAny() { switch (options.semanticLevel, options.dotMatchesNewline) { @@ -513,7 +539,22 @@ extension Compiler.ByteCodeGen { case let .quotedLiteral(s): // TODO: Should this incorporate options? - builder.buildMatchSequence(s) + if options.isCaseSensitive { + builder.buildMatchSequence(s) + } else { + // TODO: buildCaseInsensitiveMatchSequence(c) or alternative + builder.buildConsume { input, bounds in + var iterator = s.makeIterator() + var currentIndex = bounds.lowerBound + while let ch = iterator.next() { + guard currentIndex < bounds.upperBound, + ch.lowercased() == input[currentIndex].lowercased() + else { return nil } + input.formIndex(after: &currentIndex) + } + return currentIndex + } + } case let .regexLiteral(l): try emitNode(l.dslTreeNode) diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index dcdddc4a8..37a45df2e 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -25,7 +25,7 @@ struct MatchingOptions { } } -// Compiler API +// MARK: Compilation API extension MatchingOptions { /// Creates an instance with the default options. 
init() { @@ -51,7 +51,14 @@ extension MatchingOptions { stack[stack.count - 1].apply(sequence) _invariantCheck() } +} +// MARK: Matching behavior API +extension MatchingOptions { + var isCaseSensitive: Bool { + !stack.last!.contains(.caseInsensitive) + } + var isReluctantByDefault: Bool { stack.last!.contains(.reluctantByDefault) } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 088deb151..9f74ac6d3 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1117,6 +1117,31 @@ extension RegexTests { firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb") } + func testCaseSensitivity() { + matchTest( + #"c..e"#, + ("cafe", true), + ("Cafe", false)) + matchTest( + #"(?i)c.f."#, + ("cafe", true), + ("Cafe", true), + ("caFe", true)) + matchTest( + #"(?i)cafe"#, + ("cafe", true), + ("Cafe", true), + ("caFe", true)) + matchTest( + #"(?i)café"#, + ("café", true), + ("CafÉ", true)) + matchTest( + #"(?i)\u{63}af\u{e9}"#, + ("café", true), + ("CafÉ", true)) + } + func testMatchingOptionsScope() { // `.` only matches newlines when the 's' option (single-line mode) // is turned on. Standalone option-setting groups (e.g. `(?s)`) are From 5391afc8339067d53951b9f1cd104fb47985f0fb Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 17 Feb 2022 09:57:53 -0600 Subject: [PATCH 2/3] Plumb case insensitivity through custom CCs --- .../_StringProcessing/ConsumerInterface.swift | 71 ++++++++++++++----- Tests/RegexTests/MatchTests.swift | 20 +++++- 2 files changed, 71 insertions(+), 20 deletions(-) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index b6cdbba5b..90c8d127c 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -51,24 +51,34 @@ extension DSLTree.Node { } extension DSLTree.Atom { - // TODO: If ByteCodeGen switches first, then this is - // unnecessary... 
+ // TODO: If ByteCodeGen switches first, then this is unnecessary for + // top-level nodes, but it's also invoked for `.atom` members of a custom CC func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { + let isCaseSensitive = opts.isCaseSensitive + switch self { - case let .char(c): // TODO: Match level? return { input, bounds in let low = bounds.lowerBound - guard input[low] == c else { - return nil + if isCaseSensitive || !c.isCased { + return input[low] == c + ? input.index(after: low) + : nil + } else { + return input[low].lowercased() == c.lowercased() + ? input.index(after: low) + : nil } - return input.index(after: low) } case let .scalar(s): - return consumeScalar { $0 == s } + return consumeScalar { + isCaseSensitive + ? $0 == s + : $0.properties.lowercaseMapping == s.properties.lowercaseMapping + } case .any: // FIXME: Should this be a total ordering? @@ -187,14 +197,30 @@ extension DSLTree.CustomCharacterClass.Member { throw Unsupported("\(high) in range") } - return { input, bounds in - // TODO: check for out of bounds? - let curIdx = bounds.lowerBound - if (lhs...rhs).contains(input[curIdx]) { - // TODO: semantic level - return input.index(after: curIdx) + if opts.isCaseSensitive { + guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } + return { input, bounds in + // TODO: check for out of bounds? + let curIdx = bounds.lowerBound + if (lhs...rhs).contains(input[curIdx]) { + // TODO: semantic level + return input.index(after: curIdx) + } + return nil + } + } else { + let lhsLower = lhs.lowercased() + let rhsLower = rhs.lowercased() + guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } + return { input, bounds in + // TODO: check for out of bounds? 
+ let curIdx = bounds.lowerBound + if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) { + // TODO: semantic level + return input.index(after: curIdx) + } + return nil } - return nil } case let .custom(ccc): @@ -237,11 +263,20 @@ extension DSLTree.CustomCharacterClass.Member { return rhs(input, bounds) } case .quotedLiteral(let s): - return { input, bounds in - guard s.contains(input[bounds.lowerBound]) else { - return nil + if opts.isCaseSensitive { + return { input, bounds in + guard s.contains(input[bounds.lowerBound]) else { + return nil + } + return input.index(after: bounds.lowerBound) + } + } else { + return { input, bounds in + guard s.lowercased().contains(input[bounds.lowerBound].lowercased()) else { + return nil + } + return input.index(after: bounds.lowerBound) } - return input.index(after: bounds.lowerBound) } case .trivia: // TODO: Should probably strip this earlier... diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 9f74ac6d3..49e34d50f 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -113,7 +113,9 @@ func matchTest( syntax: SyntaxOptions = .traditional, enableTracing: Bool = false, dumpAST: Bool = false, - xfail: Bool = false + xfail: Bool = false, + file: StaticString = #file, + line: UInt = #line ) { for (test, expect) in tests { firstMatchTest( @@ -123,7 +125,9 @@ func matchTest( syntax: syntax, enableTracing: enableTracing, dumpAST: dumpAST, - xfail: xfail) + xfail: xfail, + file: file, + line: line) } } @@ -1140,6 +1144,18 @@ extension RegexTests { #"(?i)\u{63}af\u{e9}"#, ("café", true), ("CafÉ", true)) + + matchTest( + #"[caFE]{4}"#, + ("cafe", false), + ("CAFE", false), + ("caFE", true), + ("EFac", true)) + matchTest( + #"(?i)[caFE]{4}"#, + ("cafe", true), + ("CaFe", true), + ("EfAc", true)) } func testMatchingOptionsScope() { From d49cb633983d8f998dc120aa1b5a3fd3d10ec861 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 18 Feb 2022 15:46:21 -0600 
Subject: [PATCH 3/3] Toggle to use isCaseInsensitive internally --- Sources/_StringProcessing/ByteCodeGen.swift | 20 +++++------ .../_StringProcessing/ConsumerInterface.swift | 34 +++++++++---------- .../_StringProcessing/MatchingOptions.swift | 4 +-- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index f9d6a4b18..c4427b4bd 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -134,23 +134,21 @@ extension Compiler.ByteCodeGen { mutating func emitScalar(_ s: UnicodeScalar) throws { // TODO: Native instruction buildMatchScalar(s) - if options.isCaseSensitive { + if options.isCaseInsensitive { + // TODO: e.g. buildCaseInsensitiveMatchScalar(s) builder.buildConsume(by: consumeScalar { - $0 == s + $0.properties.lowercaseMapping == s.properties.lowercaseMapping }) } else { - // TODO: e.g. buildCaseInsensitiveMatchScalar(s) builder.buildConsume(by: consumeScalar { - $0.properties.lowercaseMapping == s.properties.lowercaseMapping + $0 == s }) } } mutating func emitCharacter(_ c: Character) throws { // FIXME: Does semantic level matter? - if options.isCaseSensitive || !c.isCased { - builder.buildMatch(c) - } else { + if options.isCaseInsensitive && c.isCased { // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true) builder.buildConsume { input, bounds in let inputChar = input[bounds.lowerBound].lowercased() @@ -159,6 +157,8 @@ extension Compiler.ByteCodeGen { ? input.index(after: bounds.lowerBound) : nil } + } else { + builder.buildMatch(c) } } @@ -539,9 +539,7 @@ extension Compiler.ByteCodeGen { case let .quotedLiteral(s): // TODO: Should this incorporate options? 
- if options.isCaseSensitive { - builder.buildMatchSequence(s) - } else { + if options.isCaseInsensitive { // TODO: buildCaseInsensitiveMatchSequence(c) or alternative builder.buildConsume { input, bounds in var iterator = s.makeIterator() @@ -554,6 +552,8 @@ extension Compiler.ByteCodeGen { } return currentIndex } + } else { + builder.buildMatchSequence(s) } case let .regexLiteral(l): diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 90c8d127c..19fc0bb0d 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -56,28 +56,28 @@ extension DSLTree.Atom { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { - let isCaseSensitive = opts.isCaseSensitive + let isCaseInsensitive = opts.isCaseInsensitive switch self { case let .char(c): // TODO: Match level? return { input, bounds in let low = bounds.lowerBound - if isCaseSensitive || !c.isCased { - return input[low] == c + if isCaseInsensitive && c.isCased { + return input[low].lowercased() == c.lowercased() ? input.index(after: low) : nil } else { - return input[low].lowercased() == c.lowercased() + return input[low] == c ? input.index(after: low) : nil } } case let .scalar(s): return consumeScalar { - isCaseSensitive - ? $0 == s - : $0.properties.lowercaseMapping == s.properties.lowercaseMapping + isCaseInsensitive + ? 
$0.properties.lowercaseMapping == s.properties.lowercaseMapping + : $0 == s } case .any: @@ -197,25 +197,25 @@ extension DSLTree.CustomCharacterClass.Member { throw Unsupported("\(high) in range") } - if opts.isCaseSensitive { - guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } + if opts.isCaseInsensitive { + let lhsLower = lhs.lowercased() + let rhsLower = rhs.lowercased() + guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } return { input, bounds in // TODO: check for out of bounds? let curIdx = bounds.lowerBound - if (lhs...rhs).contains(input[curIdx]) { + if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) { // TODO: semantic level return input.index(after: curIdx) } return nil } } else { - let lhsLower = lhs.lowercased() - let rhsLower = rhs.lowercased() - guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } + guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } return { input, bounds in // TODO: check for out of bounds? 
let curIdx = bounds.lowerBound - if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) { + if (lhs...rhs).contains(input[curIdx]) { // TODO: semantic level return input.index(after: curIdx) } @@ -263,16 +263,16 @@ extension DSLTree.CustomCharacterClass.Member { return rhs(input, bounds) } case .quotedLiteral(let s): - if opts.isCaseSensitive { + if opts.isCaseInsensitive { return { input, bounds in - guard s.contains(input[bounds.lowerBound]) else { + guard s.lowercased().contains(input[bounds.lowerBound].lowercased()) else { return nil } return input.index(after: bounds.lowerBound) } } else { return { input, bounds in - guard s.lowercased().contains(input[bounds.lowerBound].lowercased()) else { + guard s.contains(input[bounds.lowerBound]) else { return nil } return input.index(after: bounds.lowerBound) diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index 37a45df2e..c7e190824 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -55,8 +55,8 @@ extension MatchingOptions { // MARK: Matching behavior API extension MatchingOptions { - var isCaseSensitive: Bool { - !stack.last!.contains(.caseInsensitive) + var isCaseInsensitive: Bool { + stack.last!.contains(.caseInsensitive) } var isReluctantByDefault: Bool {