diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index e09616363..c4427b4bd 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -20,15 +20,11 @@ extension Compiler.ByteCodeGen { emitAny() case let .char(c): - // FIXME: Does semantic level matter? - builder.buildMatch(c) - + try emitCharacter(c) + case let .scalar(s): - // TODO: Native instruction - builder.buildConsume(by: consumeScalar { - $0 == s - }) - + try emitScalar(s) + case let .assertion(kind): try emitAssertion(kind) @@ -135,6 +131,36 @@ extension Compiler.ByteCodeGen { } } } + + mutating func emitScalar(_ s: UnicodeScalar) throws { + // TODO: Native instruction buildMatchScalar(s) + if options.isCaseInsensitive { + // TODO: e.g. buildCaseInsensitiveMatchScalar(s) + builder.buildConsume(by: consumeScalar { + $0.properties.lowercaseMapping == s.properties.lowercaseMapping + }) + } else { + builder.buildConsume(by: consumeScalar { + $0 == s + }) + } + } + + mutating func emitCharacter(_ c: Character) throws { + // FIXME: Does semantic level matter? + if options.isCaseInsensitive && c.isCased { + // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true) + builder.buildConsume { input, bounds in + let inputChar = input[bounds.lowerBound].lowercased() + let matchChar = c.lowercased() + return inputChar == matchChar + ? input.index(after: bounds.lowerBound) + : nil + } + } else { + builder.buildMatch(c) + } + } mutating func emitAny() { switch (options.semanticLevel, options.dotMatchesNewline) { @@ -513,7 +539,22 @@ extension Compiler.ByteCodeGen { case let .quotedLiteral(s): // TODO: Should this incorporate options? 
- builder.buildMatchSequence(s) + if options.isCaseInsensitive { + // TODO: buildCaseInsensitiveMatchSequence(c) or alternative + builder.buildConsume { input, bounds in + var iterator = s.makeIterator() + var currentIndex = bounds.lowerBound + while let ch = iterator.next() { + guard currentIndex < bounds.upperBound, + ch.lowercased() == input[currentIndex].lowercased() + else { return nil } + input.formIndex(after: &currentIndex) + } + return currentIndex + } + } else { + builder.buildMatchSequence(s) + } case let .regexLiteral(l): try emitNode(l.dslTreeNode) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index b6cdbba5b..19fc0bb0d 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -51,24 +51,34 @@ extension DSLTree.Node { } extension DSLTree.Atom { - // TODO: If ByteCodeGen switches first, then this is - // unnecessary... + // TODO: If ByteCodeGen switches first, then this is unnecessary for + // top-level nodes, but it's also invoked for `.atom` members of a custom CC func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { + let isCaseInsensitive = opts.isCaseInsensitive + switch self { - case let .char(c): // TODO: Match level? return { input, bounds in let low = bounds.lowerBound - guard input[low] == c else { - return nil + if isCaseInsensitive && c.isCased { + return input[low].lowercased() == c.lowercased() + ? input.index(after: low) + : nil + } else { + return input[low] == c + ? input.index(after: low) + : nil } - return input.index(after: low) } case let .scalar(s): - return consumeScalar { $0 == s } + return consumeScalar { + isCaseInsensitive + ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping + : $0 == s + } case .any: // FIXME: Should this be a total ordering? 
@@ -187,14 +197,30 @@ extension DSLTree.CustomCharacterClass.Member { throw Unsupported("\(high) in range") } - return { input, bounds in - // TODO: check for out of bounds? - let curIdx = bounds.lowerBound - if (lhs...rhs).contains(input[curIdx]) { - // TODO: semantic level - return input.index(after: curIdx) + if opts.isCaseInsensitive { + let lhsLower = lhs.lowercased() + let rhsLower = rhs.lowercased() + guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } + return { input, bounds in + // TODO: check for out of bounds? + let curIdx = bounds.lowerBound + if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) { + // TODO: semantic level + return input.index(after: curIdx) + } + return nil + } + } else { + guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } + return { input, bounds in + // TODO: check for out of bounds? + let curIdx = bounds.lowerBound + if (lhs...rhs).contains(input[curIdx]) { + // TODO: semantic level + return input.index(after: curIdx) + } + return nil } - return nil } case let .custom(ccc): @@ -237,11 +263,20 @@ extension DSLTree.CustomCharacterClass.Member { return rhs(input, bounds) } case .quotedLiteral(let s): - return { input, bounds in - guard s.contains(input[bounds.lowerBound]) else { - return nil + if opts.isCaseInsensitive { + return { input, bounds in + guard s.lowercased().contains(input[bounds.lowerBound].lowercased()) else { + return nil + } + return input.index(after: bounds.lowerBound) + } + } else { + return { input, bounds in + guard s.contains(input[bounds.lowerBound]) else { + return nil + } + return input.index(after: bounds.lowerBound) } - return input.index(after: bounds.lowerBound) } case .trivia: // TODO: Should probably strip this earlier... 
diff --git a/Sources/_StringProcessing/MatchingOptions.swift b/Sources/_StringProcessing/MatchingOptions.swift index dcdddc4a8..c7e190824 100644 --- a/Sources/_StringProcessing/MatchingOptions.swift +++ b/Sources/_StringProcessing/MatchingOptions.swift @@ -25,7 +25,7 @@ struct MatchingOptions { } } -// Compiler API +// MARK: Compilation API extension MatchingOptions { /// Creates an instance with the default options. init() { @@ -51,7 +51,14 @@ extension MatchingOptions { stack[stack.count - 1].apply(sequence) _invariantCheck() } +} +// MARK: Matching behavior API +extension MatchingOptions { + var isCaseInsensitive: Bool { + stack.last!.contains(.caseInsensitive) + } + var isReluctantByDefault: Bool { stack.last!.contains(.reluctantByDefault) } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 088deb151..49e34d50f 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -113,7 +113,9 @@ func matchTest( syntax: SyntaxOptions = .traditional, enableTracing: Bool = false, dumpAST: Bool = false, - xfail: Bool = false + xfail: Bool = false, + file: StaticString = #file, + line: UInt = #line ) { for (test, expect) in tests { firstMatchTest( @@ -123,7 +125,9 @@ func matchTest( syntax: syntax, enableTracing: enableTracing, dumpAST: dumpAST, - xfail: xfail) + xfail: xfail, + file: file, + line: line) } } @@ -1117,6 +1121,43 @@ extension RegexTests { firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb") } + func testCaseSensitivity() { + matchTest( + #"c..e"#, + ("cafe", true), + ("Cafe", false)) + matchTest( + #"(?i)c.f."#, + ("cafe", true), + ("Cafe", true), + ("caFe", true)) + matchTest( + #"(?i)cafe"#, + ("cafe", true), + ("Cafe", true), + ("caFe", true)) + matchTest( + #"(?i)café"#, + ("café", true), + ("CafÉ", true)) + matchTest( + #"(?i)\u{63}af\u{e9}"#, + ("café", true), + ("CafÉ", true)) + + matchTest( + #"[caFE]{4}"#, + ("cafe", false), + ("CAFE", false), + ("caFE", true), + ("EFac", true)) 
+ matchTest( + #"(?i)[caFE]{4}"#, + ("cafe", true), + ("CaFe", true), + ("EfAc", true)) + } + func testMatchingOptionsScope() { // `.` only matches newlines when the 's' option (single-line mode) // is turned on. Standalone option-setting groups (e.g. `(?s)`) are