/// Emits the instruction that matches one custom character class.
///
/// A `.matchBitset` instruction is emitted when all of the following hold:
/// optimizations are enabled, the current semantic level is grapheme
/// cluster, and `ccc.asAsciiBitset(_:)` can express the class as a bitset.
/// In every other case the class is lowered to a consume function.
///
/// - Parameter ccc: The custom character class to lower.
/// - Throws: Rethrows errors from `generateConsumer(_:)` on the fallback path.
mutating func emitCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) throws {
  // The two flag checks are cheap; test them first so the bitset is only
  // built when it can actually be used.
  //
  // future work: add a bit to .matchBitset to consume either a character
  // or a scalar so we can have this optimization in scalar mode
  if optimizationsEnabled,
     options.semanticLevel == .graphemeCluster,
     let asciiBitset = ccc.asAsciiBitset(options) {
    builder.buildMatchAsciiBitset(asciiBitset)
  } else {
    let consumer = try ccc.generateConsumer(options)
    builder.buildConsume(by: consumer)
  }
}
// Testing support

/// Parses `regex` and compiles it into an `Executor`, optionally forcing a
/// semantic level.
///
/// - Parameters:
///   - regex: The regex source text.
///   - syntax: Syntax options passed to the parser.
///   - semanticLevel: When non-nil, the parsed tree is wrapped in a
///     non-capturing group that switches to the requested semantics.
/// - Returns: An executor for the compiled program.
/// - Throws: Errors raised by `parse` or by `Compiler.emit()`.
@available(SwiftStdlib 5.7, *)
func _compileRegex(
  _ regex: String,
  _ syntax: SyntaxOptions = .traditional,
  _ semanticLevel: RegexSemanticLevel? = nil
) throws -> Executor {
  let ast = try parse(regex, .semantic, syntax)

  // Wraps the parsed root in a group that applies `sequence`.
  func tree(applying sequence: AST.MatchingOptionSequence) -> DSLTree {
    DSLTree(.nonCapturingGroup(
      .init(ast: .changeMatchingOptions(sequence)),
      ast.dslTree.root))
  }

  let dsl: DSLTree
  switch semanticLevel?.base {
  case .none:
    dsl = ast.dslTree
  case .graphemeCluster:
    dsl = tree(applying: AST.MatchingOptionSequence(
      adding: [.init(.graphemeClusterSemantics, location: .fake)]))
  case .unicodeScalar:
    dsl = tree(applying: AST.MatchingOptionSequence(
      adding: [.init(.unicodeScalarSemantics, location: .fake)]))
  }

  return Executor(program: try Compiler(tree: dsl).emit())
}
/// Attempts to express this character class member as an ASCII bitset.
///
/// Only two shapes are representable: an `.atom` with a single-scalar ASCII
/// value, and a `.range` whose bounds both have one.
///
/// - Parameters:
///   - opts: Matching options; only `isCaseInsensitive` is consulted.
///   - isInverted: Inversion flag stored in the resulting bitset.
/// - Returns: The bitset, or `nil` when this member cannot be represented,
///   so the caller can fall back to a consume function.
func asAsciiBitset(
  _ opts: MatchingOptions,
  _ isInverted: Bool
) -> DSLTree.CustomCharacterClass.AsciiBitset? {
  switch self {
  case let .atom(a):
    guard let val = a.singleScalarASCIIValue else { return nil }
    return DSLTree.CustomCharacterClass.AsciiBitset(
      val,
      isInverted,
      opts.isCaseInsensitive
    )
  case let .range(low, high):
    guard let lowVal = low.singleScalarASCIIValue,
          let highVal = high.singleScalarASCIIValue,
          // The range initializer iterates `low...high`, which traps when the
          // bounds are inverted. Decline the optimization in that case rather
          // than crash while compiling.
          lowVal <= highVal
    else { return nil }
    return DSLTree.CustomCharacterClass.AsciiBitset(
      low: lowVal,
      high: highVal,
      isInverted: isInverted,
      isCaseInsensitive: opts.isCaseInsensitive
    )
  default:
    return nil
  }
}
/// Appends a `.matchBitset` instruction whose operand refers to `b`.
///
/// `b` is stored through `makeAsciiBitset(_:)`; the register that call
/// returns becomes the instruction's payload.
mutating func buildMatchAsciiBitset(
  _ b: DSLTree.CustomCharacterClass.AsciiBitset
) {
  let register = makeAsciiBitset(b)
  let payload = Instruction.Payload(bitset: register)
  instructions.append(.init(.matchBitset, payload))
}
/// Tries to match the current input element against `bitset`.
///
/// On success one element is consumed and `true` is returned. When there is
/// no element to load, or the bitset rejects it, failure is signalled and
/// `false` is returned.
mutating func matchBitset(
  _ bitset: DSLTree.CustomCharacterClass.AsciiBitset
) -> Bool {
  if let cur = load(), bitset.matches(char: cur) {
    _uncheckedForcedConsumeOne()
    return true
  }
  signalFailure()
  return false
}
/// Returns the ASCII bitset stored at register `i`.
subscript(
  _ i: AsciiBitsetRegister
) -> DSLTree.CustomCharacterClass.AsciiBitset {
  get {
    let slot = i.rawValue
    return bitsets[slot]
  }
}
/// A set of ASCII values stored as two 64-bit words, optionally inverted.
///
/// `a` holds membership bits for the values 0..<64 and `b` for 64..<128.
/// When `isInverted` is true, `matches(char:)` reports the complement of the
/// stored set, so non-ASCII characters match.
internal struct AsciiBitset {
  let isInverted: Bool
  var a: UInt64 = 0
  var b: UInt64 = 0

  /// Creates an empty set.
  init(isInverted: Bool) {
    self.isInverted = isInverted
  }

  /// Creates a set holding `val`, plus its other-case ASCII letter when
  /// `isCaseInsensitive` is true.
  init(_ val: UInt8, _ isInverted: Bool, _ isCaseInsensitive: Bool) {
    self.isInverted = isInverted
    add(val, isCaseInsensitive)
  }

  /// Creates a set holding every value in `low...high`.
  ///
  /// - Precondition: `low <= high`; the closed range traps otherwise.
  init(low: UInt8, high: UInt8, isInverted: Bool, isCaseInsensitive: Bool) {
    self.isInverted = isInverted
    for val in low...high {
      add(val, isCaseInsensitive)
    }
  }

  /// Creates a set directly from its two backing words.
  internal init(
    a: UInt64,
    b: UInt64,
    isInverted: Bool
  ) {
    self.isInverted = isInverted
    self.a = a
    self.b = b
  }

  /// Inserts `val`; when `isCaseInsensitive` is true and `val` is an ASCII
  /// letter, the letter of the opposite case is inserted as well.
  internal mutating func add(_ val: UInt8, _ isCaseInsensitive: Bool) {
    setBit(val)
    guard isCaseInsensitive else { return }
    switch val {
    // 'A'...'Z'. The range starts at 65: value 64 is '@', which has no case
    // counterpart and must not pull in '`' (96).
    case 65...90: setBit(val + 32)
    // 'a'...'z'
    case 97...122: setBit(val - 32)
    default: break
    }
  }

  /// Sets the membership bit for `val`.
  internal mutating func setBit(_ val: UInt8) {
    if val < 64 {
      a |= UInt64(1) << val
    } else {
      b |= UInt64(1) << (val - 64)
    }
  }

  /// Reports whether `char` is accepted by this set, honoring `isInverted`.
  internal func matches(char: Character) -> Bool {
    let isMember: Bool
    // `Character.asciiValue` reports 10 for the two-scalar CR-LF character.
    // CR-LF is a different character from "\n", so it is never a member of
    // the stored set.
    if char != "\r\n", let val = char.asciiValue {
      isMember = val < 64
        ? (a >> val) & 1 == 1
        : (b >> (val - 64)) & 1 == 1
    } else {
      isMember = false
    }
    return isInverted ? !isMember : isMember
  }

  /// Joins another bitset from a Member of the same CustomCharacterClass
  ///
  /// - Precondition: Both bitsets have the same `isInverted` flag.
  internal func union(_ other: AsciiBitset) -> AsciiBitset {
    precondition(self.isInverted == other.isInverted)
    return AsciiBitset(
      a: self.a | other.a,
      b: self.b | other.b,
      isInverted: self.isInverted
    )
  }
}
/// Asserts that the program compiled from `regex` contains every opcode in
/// `targets`.
///
/// A compilation error is reported as a test failure at `file`/`line`.
private func expectProgram(
  for regex: String,
  syntax: SyntaxOptions = .traditional,
  semanticLevel: RegexSemanticLevel? = nil,
  contains targets: Set<Instruction.OpCode>,
  file: StaticString = #file,
  line: UInt = #line
) {
  let prog: Executor
  do {
    prog = try _compileRegex(regex, syntax, semanticLevel)
  } catch {
    XCTFail(
      "Failed to compile regex '\(regex)': \(error)",
      file: file,
      line: line)
    return
  }

  // The subset of the wanted opcodes that the program actually emitted.
  let found = targets.intersection(prog.engine.instructions.map(\.opcode))
  guard found.isSuperset(of: targets) else {
    XCTFail(
      "Compiled regex '\(regex)' did not contain desired opcodes. Wanted: \(targets), found: \(found)",
      file: file,
      line: line)
    return
  }
}
/// Finds the first match of `regexStr` in `input`.
///
/// - Parameters:
///   - regexStr: Regex source text.
///   - input: The string to search.
///   - validateOptimizations: When true, the search is repeated with
///     optimizations disabled and the two matched substrings are compared
///     with `XCTAssertEqual`.
///   - syntax: Syntax options used to build the regex.
/// - Returns: The matched substring and the captures as optional strings.
/// - Throws: `MatchError` when either search finds nothing, plus any error
///   from building or running the regex.
func _firstMatch(
  _ regexStr: String,
  input: String,
  validateOptimizations: Bool,
  syntax: SyntaxOptions = .traditional
) throws -> (String, [String?]) {
  var regex = try Regex(regexStr, syntax: syntax)
  guard let result = try regex.firstMatch(in: input) else {
    throw MatchError("match not found for \(regexStr) in \(input)")
  }
  let caps = result.output.slices(from: input)
  let matched = String(input[result.range])

  if validateOptimizations {
    // Re-run the same regex without optimizations as a reference result.
    regex._setCompilerOptionsForTesting(.disableOptimizations)
    guard let unoptResult = try regex.firstMatch(in: input) else {
      throw MatchError("match not found for unoptimized \(regexStr) in \(input)")
    }
    let unoptMatched = String(input[unoptResult.range])
    XCTAssertEqual(
      matched,
      unoptMatched,
      "Unoptimized regex returned a different result")
  }

  let captureStrings = caps.map { $0.map(String.init) }
  return (matched, captureStrings)
}
/// Checks a character class that mixes an ASCII range with emoji members
/// under both grapheme-cluster and unicode-scalar semantics.
func testCharacterClassesWithStrings() {
  let regex = regex(#"[a-z🧐🇧🇪🇧🇫🇧🇬]"#)
  let scalarRegex = regex.matchingSemantics(.unicodeScalar)

  // Default semantics: each emoji is matched as a whole character.
  XCTAssertEqual("🧐", "🧐".wholeMatch(of: regex)?.0)
  XCTAssertEqual("🇧🇫", "🇧🇫".wholeMatch(of: regex)?.0)

  // Scalar semantics: the flag does not whole-match, and the first match
  // is its leading scalar only.
  XCTAssertEqual("🧐", "🧐".wholeMatch(of: scalarRegex)?.0)
  XCTAssertEqual(nil, "🇧🇫".wholeMatch(of: scalarRegex)?.0)
  XCTAssertEqual("🧐", "🧐".firstMatch(of: scalarRegex)?.0)
  XCTAssertEqual("\u{1f1e7}", "🇧🇫".firstMatch(of: scalarRegex)?.0)
}