diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 6263186e8..477760ef8 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -65,10 +65,14 @@ fileprivate extension Compiler.ByteCodeGen { emitDot() case let .char(c): - try emitCharacter(c) + emitCharacter(c) case let .scalar(s): - try emitScalar(s) + if options.semanticLevel == .graphemeCluster { + emitCharacter(Character(s)) + } else { + emitMatchScalar(s) + } case let .assertion(kind): try emitAssertion(kind) @@ -94,6 +98,34 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitQuotedLiteral(_ s: String) { + guard options.semanticLevel == .graphemeCluster else { + for char in s { + for scalar in char.unicodeScalars { + emitMatchScalar(scalar) + } + } + return + } + + // Fast path for eliding boundary checks for an all ascii quoted literal + if optimizationsEnabled && s.allSatisfy(\.isASCII) { + let lastIdx = s.unicodeScalars.indices.last! + for idx in s.unicodeScalars.indices { + let boundaryCheck = idx == lastIdx + let scalar = s.unicodeScalars[idx] + if options.isCaseInsensitive && scalar.properties.isCased { + builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck) + } else { + builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck) + } + } + return + } + + for c in s { emitCharacter(c) } + } + mutating func emitBackreference( _ ref: AST.Reference ) throws { @@ -257,41 +289,47 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitScalar(_ s: UnicodeScalar) throws { - // TODO: Native instruction buildMatchScalar(s) - if options.isCaseInsensitive { - // TODO: e.g. 
buildCaseInsensitiveMatchScalar(s) - builder.buildConsume(by: consumeScalar { - $0.properties.lowercaseMapping == s.properties.lowercaseMapping - }) + mutating func emitMatchScalar(_ s: UnicodeScalar) { + assert(options.semanticLevel == .unicodeScalar) + if options.isCaseInsensitive && s.properties.isCased { + builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false) } else { - builder.buildConsume(by: consumeScalar { - $0 == s - }) + builder.buildMatchScalar(s, boundaryCheck: false) } } - mutating func emitCharacter(_ c: Character) throws { - // Unicode scalar matches the specific scalars that comprise a character + mutating func emitCharacter(_ c: Character) { + // Unicode scalar mode matches the specific scalars that comprise a character if options.semanticLevel == .unicodeScalar { for scalar in c.unicodeScalars { - try emitScalar(scalar) + emitMatchScalar(scalar) } return } if options.isCaseInsensitive && c.isCased { - // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true) - builder.buildConsume { input, bounds in - let inputChar = input[bounds.lowerBound].lowercased() - let matchChar = c.lowercased() - return inputChar == matchChar - ? input.index(after: bounds.lowerBound) - : nil + if optimizationsEnabled && c.isASCII { + // c.isCased ensures that c is not CR-LF, + // so we know that c is a single scalar + assert(c.unicodeScalars.count == 1) + builder.buildMatchScalarCaseInsensitive( + c.unicodeScalars.last!, + boundaryCheck: true) + } else { + builder.buildMatch(c, isCaseInsensitive: true) } - } else { - builder.buildMatch(c) + return + } + + if optimizationsEnabled && c.isASCII { + let lastIdx = c.unicodeScalars.indices.last! 
+ for idx in c.unicodeScalars.indices { + builder.buildMatchScalar(c.unicodeScalars[idx], boundaryCheck: idx == lastIdx) + } + return } + + builder.buildMatch(c, isCaseInsensitive: false) } mutating func emitAny() { @@ -567,7 +605,12 @@ fileprivate extension Compiler.ByteCodeGen { decrement %minTrips and fallthrough loop-body: + <if can't guarantee forward progress && extraTrips = nil>: + mov currentPosition %pos evaluate the subexpression + <if can't guarantee forward progress && extraTrips = nil>: + if %pos is currentPosition: + goto exit goto min-trip-count control block exit-policy control block: @@ -670,7 +713,28 @@ fileprivate extension Compiler.ByteCodeGen { // // branch min-trip-count builder.label(loopBody) + + // if we aren't sure if the child node will have forward progress and + // we have an unbounded quantification + let startPosition: PositionRegister? + let emitPositionChecking = + (!optimizationsEnabled || !child.guaranteesForwardProgress) && + extraTrips == nil + + if emitPositionChecking { + startPosition = builder.makePositionRegister() + builder.buildMoveCurrentPosition(into: startPosition!) + } else { + startPosition = nil + } try emitNode(child) + if emitPositionChecking { + // in all quantifier cases, no matter what minTrips or extraTrips is, + // if we have a successful non-advancing match, branch to exit because it + // can match an arbitrary number of times + builder.buildCondBranch(to: exit, ifSamePositionAs: startPosition!)
+ } + if minTrips <= 1 { // fallthrough } else { @@ -715,11 +779,12 @@ fileprivate extension Compiler.ByteCodeGen { _ ccc: DSLTree.CustomCharacterClass ) throws { if let asciiBitset = ccc.asAsciiBitset(options), - options.semanticLevel == .graphemeCluster, optimizationsEnabled { - // future work: add a bit to .matchBitset to consume either a character - // or a scalar so we can have this optimization in scalar mode - builder.buildMatchAsciiBitset(asciiBitset) + if options.semanticLevel == .unicodeScalar { + builder.buildScalarMatchAsciiBitset(asciiBitset) + } else { + builder.buildMatchAsciiBitset(asciiBitset) + } } else { let consumer = try ccc.generateConsumer(options) builder.buildConsume(by: consumer) @@ -796,45 +861,7 @@ fileprivate extension Compiler.ByteCodeGen { try emitAtom(a) case let .quotedLiteral(s): - if options.semanticLevel == .graphemeCluster { - if options.isCaseInsensitive { - // TODO: buildCaseInsensitiveMatchSequence(c) or alternative - builder.buildConsume { input, bounds in - var iterator = s.makeIterator() - var currentIndex = bounds.lowerBound - while let ch = iterator.next() { - guard currentIndex < bounds.upperBound, - ch.lowercased() == input[currentIndex].lowercased() - else { return nil } - input.formIndex(after: &currentIndex) - } - return currentIndex - } - } else { - builder.buildMatchSequence(s) - } - } else { - builder.buildConsume { - [caseInsensitive = options.isCaseInsensitive] input, bounds in - // TODO: Case folding - var iterator = s.unicodeScalars.makeIterator() - var currentIndex = bounds.lowerBound - while let scalar = iterator.next() { - guard currentIndex < bounds.upperBound else { return nil } - if caseInsensitive { - if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping { - return nil - } - } else { - if scalar != input.unicodeScalars[currentIndex] { - return nil - } - } - input.unicodeScalars.formIndex(after: &currentIndex) - } - return currentIndex - } - } + emitQuotedLiteral(s)
case let .convertedRegexLiteral(n, _): return try emitNode(n) @@ -856,3 +883,42 @@ fileprivate extension Compiler.ByteCodeGen { return nil } } + +extension DSLTree.Node { + var guaranteesForwardProgress: Bool { + switch self { + case .orderedChoice(let children): + return children.allSatisfy { $0.guaranteesForwardProgress } + case .concatenation(let children): + return children.contains(where: { $0.guaranteesForwardProgress }) + case .capture(_, _, let node, _): + return node.guaranteesForwardProgress + case .nonCapturingGroup(let kind, let child): + switch kind.ast { + case .lookahead, .negativeLookahead, .lookbehind, .negativeLookbehind: + return false + default: return child.guaranteesForwardProgress + } + case .atom(let atom): + switch atom { + case .changeMatchingOptions, .assertion: return false + default: return true + } + case .trivia, .empty: + return false + case .quotedLiteral(let string): + return !string.isEmpty + case .convertedRegexLiteral(let node, _): + return node.guaranteesForwardProgress + case .consumer, .matcher: + // Allow zero width consumers and matchers + return false + case .customCharacterClass: + return true + case .quantification(let amount, _, let child): + let (atLeast, _) = amount.ast.bounds + return atLeast ?? 0 > 0 && child.guaranteesForwardProgress + default: return false + } + } +} diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index fb9267f4f..668d16eb6 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -11,6 +11,13 @@ @_implementationOnly import _RegexParser +extension Character { + var _singleScalarAsciiValue: UInt8? 
{ + guard self != "\r\n" else { return nil } + return asciiValue + } +} + extension DSLTree.Node { /// Attempt to generate a consumer from this AST node /// @@ -53,11 +60,50 @@ extension DSLTree._AST.Atom { } } +extension Character { + func generateConsumer( + _ opts: MatchingOptions + ) throws -> MEProgram.ConsumeFunction? { + let isCaseInsensitive = opts.isCaseInsensitive + switch opts.semanticLevel { + case .graphemeCluster: + return { input, bounds in + let low = bounds.lowerBound + if isCaseInsensitive && isCased { + return input[low].lowercased() == lowercased() + ? input.index(after: low) + : nil + } else { + return input[low] == self + ? input.index(after: low) + : nil + } + } + case .unicodeScalar: + // TODO: This should only be reachable from character class emission, can + // we guarantee that? Otherwise we'd want a different matching behavior. + let consumers = unicodeScalars.map { s in consumeScalar { + isCaseInsensitive + ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping + : $0 == s + }} + return { input, bounds in + for fn in consumers { + if let idx = fn(input, bounds) { + return idx + } + } + return nil + } + } + } +} + extension DSLTree.Atom { var singleScalarASCIIValue: UInt8? { switch self { - case let .char(c) where c != "\r\n": - return c.asciiValue + case let .char(c): + return c._singleScalarAsciiValue case let .scalar(s) where s.isASCII: return UInt8(ascii: s) case let .unconverted(atom): @@ -72,44 +118,15 @@ extension DSLTree.Atom { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { - let isCaseInsensitive = opts.isCaseInsensitive - switch self { case let .char(c): - if opts.semanticLevel == .graphemeCluster { - return { input, bounds in - let low = bounds.lowerBound - if isCaseInsensitive && c.isCased { - return input[low].lowercased() == c.lowercased() - ? input.index(after: low) - : nil - } else { - return input[low] == c - ? 
input.index(after: low) - : nil - } - } - } else { - let consumers = c.unicodeScalars.map { s in consumeScalar { - isCaseInsensitive - ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping - : $0 == s - }} - return { input, bounds in - for fn in consumers { - if let idx = fn(input, bounds) { - return idx - } - } - return nil - } - } + return try c.generateConsumer(opts) + case let .scalar(s): - return consumeScalar { - isCaseInsensitive - ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping - : $0 == s - } + // A scalar always matches the same as a single scalar character. This + // means it must match a whole grapheme in grapheme semantic mode, but + // can match a single scalar in scalar semantic mode. + return try Character(s).generateConsumer(opts) case .any: // FIXME: Should this be a total ordering? @@ -230,16 +247,20 @@ extension AST.Atom { var singleScalar: UnicodeScalar? { switch kind { case .scalar(let s): return s.value + case .escaped(let e): + guard let s = e.scalarValue else { return nil } + return s default: return nil } } var singleScalarASCIIValue: UInt8? { + if let s = singleScalar, s.isASCII { + return UInt8(ascii: s) + } switch kind { - case let .char(c) where c != "\r\n": - return c.asciiValue - case let .scalar(s) where s.value.isASCII: - return UInt8(ascii: s.value) + case let .char(c): + return c._singleScalarAsciiValue default: return nil } diff --git a/Sources/_StringProcessing/Engine/Backtracking.swift b/Sources/_StringProcessing/Engine/Backtracking.swift index 8fcdf9312..355702ac1 100644 --- a/Sources/_StringProcessing/Engine/Backtracking.swift +++ b/Sources/_StringProcessing/Engine/Backtracking.swift @@ -32,15 +32,18 @@ extension Processor { // The int registers store values that can be relevant to // backtracking, such as the number of trips in a quantification. 
var intRegisters: [Int] + // Same with position registers + var posRegisters: [Input.Index] var destructure: ( pc: InstructionAddress, pos: Position?, stackEnd: CallStackAddress, captureEnds: [_StoredCapture], - intRegisters: [Int] + intRegisters: [Int], + PositionRegister: [Input.Index] ) { - (pc, pos, stackEnd, captureEnds, intRegisters) + (pc, pos, stackEnd, captureEnds, intRegisters, posRegisters) } } @@ -53,7 +56,8 @@ extension Processor { pos: addressOnly ? nil : currentPosition, stackEnd: .init(callStack.count), captureEnds: storedCaptures, - intRegisters: registers.ints) + intRegisters: registers.ints, + posRegisters: registers.positions) } } diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index c614e10fd..42fb86913 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -147,6 +147,26 @@ extension Instruction.Payload { var string: StringRegister { interpret() } + + init(scalar: Unicode.Scalar) { + self.init(UInt64(scalar.value)) + } + var scalar: Unicode.Scalar { + return Unicode.Scalar(_value: UInt32(self.rawValue)) + } + + init(scalar: Unicode.Scalar, caseInsensitive: Bool, boundaryCheck: Bool) { + let raw = UInt64(scalar.value) + + (caseInsensitive ? 1 << 55: 0) + + (boundaryCheck ? 
1 << 54 : 0) + self.init(raw) + } + var scalarPayload: (Unicode.Scalar, caseInsensitive: Bool, boundaryCheck: Bool) { + let caseInsensitive = (self.rawValue >> 55) & 1 == 1 + let boundaryCheck = (self.rawValue >> 54) & 1 == 1 + let scalar = Unicode.Scalar(_value: UInt32(self.rawValue & 0xFFFF_FFFF)) + return (scalar, caseInsensitive: caseInsensitive, boundaryCheck: boundaryCheck) + } init(sequence: SequenceRegister) { self.init(sequence) @@ -190,18 +210,20 @@ extension Instruction.Payload { interpret() } - init(element: ElementRegister) { - self.init(element) + init(element: ElementRegister, isCaseInsensitive: Bool) { + self.init(isCaseInsensitive ? 1 : 0, element) } - var element: ElementRegister { - interpret() + var elementPayload: (isCaseInsensitive: Bool, ElementRegister) { + let pair: (UInt64, ElementRegister) = interpretPair() + return (isCaseInsensitive: pair.0 == 1, pair.1) } - init(bitset: AsciiBitsetRegister) { - self.init(bitset) + init(bitset: AsciiBitsetRegister, isScalar: Bool) { + self.init(isScalar ? 
1 : 0, bitset) } - var bitset: AsciiBitsetRegister { - interpret() + var bitsetPayload: (isScalar: Bool, AsciiBitsetRegister) { + let pair: (UInt64, AsciiBitsetRegister) = interpretPair() + return (isScalar: pair.0 == 1, pair.1) } init(consumer: ConsumeFunctionRegister) { @@ -284,10 +306,10 @@ extension Instruction.Payload { interpretPair() } - init(pos: PositionRegister, pos2: PositionRegister) { - self.init(pos, pos2) + init(addr: InstructionAddress, position: PositionRegister) { + self.init(addr, position) } - var pairedPosPos: (PositionRegister, PositionRegister) { + var pairedAddrPos: (InstructionAddress, PositionRegister) { interpretPair() } diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 4e715ad9d..8e1a1f294 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -37,6 +37,14 @@ extension Instruction { /// case moveImmediate + /// Move the current position into a register + /// + /// moveCurrentPosition(into: PositionRegister) + /// + /// Operands: + /// - Position register to move into + case moveCurrentPosition + // MARK: General Purpose: Control flow /// Branch to a new instruction @@ -57,6 +65,16 @@ extension Instruction { /// case condBranchZeroElseDecrement + /// Conditionally branch if the current position is the same as the register + /// + /// condBranch( + /// to: InstAddr, ifSamePositionAs: PositionRegister) + /// + /// Operands: + /// - Instruction address to branch to, if the position in the register is the same as currentPosition + /// - Position register to check against + case condBranchSamePosition + // TODO: Function calls // MARK: - Matching @@ -72,20 +90,27 @@ extension Instruction { /// Composite assert-advance else restore. /// - /// match(_: EltReg) + /// match(_: EltReg, isCaseInsensitive: Bool) /// - /// Operand: Element register to compare against. 
+ /// Operands: + /// - Element register to compare against. + /// - Boolean for if we should match in a case insensitive way case match - /// Match against a sequence of elements + /// Match against a scalar and possibly perform a boundary check or match in a case insensitive way /// - /// matchSequence(_: SeqReg) + /// matchScalar(_: Unicode.Scalar, isCaseInsensitive: Bool, boundaryCheck: Bool) /// - /// Operand: Sequence register to compare against. - case matchSequence + /// Operands: Scalar value to match against and booleans + case matchScalar - /// Match against a set of valid ascii values stored in a bitset - /// Operand: Ascii bitset register containing the bitset + /// Match a character or a scalar against a set of valid ascii values stored in a bitset + /// + /// matchBitset(_: AsciiBitsetRegister, isScalar: Bool) + /// + /// Operand: + /// - Ascii bitset register containing the bitset + /// - Boolean for if we should match by scalar value case matchBitset /// TODO: builtin assertions and anchors @@ -306,7 +331,7 @@ extension Instruction { var elementRegister: ElementRegister? { switch opcode { case .match: - return payload.element + return payload.elementPayload.1 default: return nil } } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 676b21473..0b9a91726 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -32,6 +32,7 @@ extension MEProgram { var nextIntRegister = IntRegister(0) var nextCaptureRegister = CaptureRegister(0) var nextValueRegister = ValueRegister(0) + var nextPositionRegister = PositionRegister(0) // Special addresses or instructions var failAddressToken: AddressToken? 
= nil @@ -105,6 +106,14 @@ extension MEProgram.Builder { fixup(to: t) } + mutating func buildCondBranch( + to t: AddressToken, + ifSamePositionAs r: PositionRegister + ) { + instructions.append(.init(.condBranchSamePosition, .init(position: r))) + fixup(to: t) + } + mutating func buildSave(_ t: AddressToken) { instructions.append(.init(.save)) fixup(to: t) @@ -135,24 +144,32 @@ extension MEProgram.Builder { instructions.append(.init(.advance, .init(distance: n))) } - mutating func buildMatch(_ e: Character) { + mutating func buildMatch(_ e: Character, isCaseInsensitive: Bool) { instructions.append(.init( - .match, .init(element: elements.store(e)))) + .match, .init(element: elements.store(e), isCaseInsensitive: isCaseInsensitive))) } - mutating func buildMatchSequence<S: Sequence>( - _ s: S - ) where S.Element == Character { - instructions.append(.init( - .matchSequence, - .init(sequence: sequences.store(.init(s))))) + mutating func buildMatchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) { + instructions.append(.init(.matchScalar, .init(scalar: s, caseInsensitive: false, boundaryCheck: boundaryCheck))) + } + + mutating func buildMatchScalarCaseInsensitive(_ s: Unicode.Scalar, boundaryCheck: Bool) { + instructions.append(.init(.matchScalar, .init(scalar: s, caseInsensitive: true, boundaryCheck: boundaryCheck))) } + mutating func buildMatchAsciiBitset( _ b: DSLTree.CustomCharacterClass.AsciiBitset ) { instructions.append(.init( - .matchBitset, .init(bitset: makeAsciiBitset(b)))) + .matchBitset, .init(bitset: makeAsciiBitset(b), isScalar: false))) + } + + mutating func buildScalarMatchAsciiBitset( + _ b: DSLTree.CustomCharacterClass.AsciiBitset + ) { + instructions.append(.init( + .matchBitset, .init(bitset: makeAsciiBitset(b), isScalar: true))) } mutating func buildConsume( @@ -211,6 +228,10 @@ extension MEProgram.Builder { .init(value: value, capture: capture))) } + mutating func buildMoveCurrentPosition(into r: PositionRegister) { +
instructions.append(.init(.moveCurrentPosition, .init(position: r))) + } + mutating func buildBackreference( _ cap: CaptureRegister ) { @@ -257,7 +278,8 @@ extension MEProgram.Builder { switch inst.opcode { case .condBranchZeroElseDecrement: payload = .init(addr: addr, int: inst.payload.int) - + case .condBranchSamePosition: + payload = .init(addr: addr, position: inst.payload.position) case .branch, .save, .saveAddress, .clearThrough: payload = .init(addr: addr) @@ -281,6 +303,7 @@ extension MEProgram.Builder { regInfo.sequences = sequences.count regInfo.ints = nextIntRegister.rawValue regInfo.values = nextValueRegister.rawValue + regInfo.positions = nextPositionRegister.rawValue regInfo.bitsets = asciiBitsets.count regInfo.consumeFunctions = consumeFunctions.count regInfo.assertionFunctions = assertionFunctions.count @@ -421,6 +444,12 @@ extension MEProgram.Builder { return r } + mutating func makePositionRegister() -> PositionRegister { + let r = nextPositionRegister + defer { nextPositionRegister.rawValue += 1 } + return r + } + // TODO: A register-mapping helper struct, which could release // registers without monotonicity required diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index f7b3a65a2..2be918294 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -219,6 +219,15 @@ extension Processor { return true } + mutating func matchCaseInsensitive(_ e: Element) -> Bool { + guard let cur = load(), cur.lowercased() == e.lowercased() else { + signalFailure() + return false + } + _uncheckedForcedConsumeOne() + return true + } + // Match against the current input prefix. Returns whether // it succeeded vs signaling an error. mutating func matchSeq( @@ -230,6 +239,44 @@ extension Processor { return true } + func loadScalar() -> Unicode.Scalar? { + currentPosition < end ? 
input.unicodeScalars[currentPosition] : nil + } + + mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool { + guard s == loadScalar(), + let idx = input.unicodeScalars.index( + currentPosition, + offsetBy: 1, + limitedBy: end), + (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) + else { + signalFailure() + return false + } + currentPosition = idx + return true + } + + mutating func matchScalarCaseInsensitive( + _ s: Unicode.Scalar, + boundaryCheck: Bool + ) -> Bool { + guard let curScalar = loadScalar(), + s.properties.lowercaseMapping == curScalar.properties.lowercaseMapping, + let idx = input.unicodeScalars.index( + currentPosition, + offsetBy: 1, + limitedBy: end), + (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) + else { + signalFailure() + return false + } + currentPosition = idx + return true + } + // If we have a bitset we know that the CharacterClass only matches against // ascii characters, so check if the current input element is ascii then // check if it is set in the bitset @@ -244,8 +291,22 @@ extension Processor { return true } + // Equivalent of matchBitset but emitted when in unicode scalar semantic mode + mutating func matchBitsetScalar( + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + ) -> Bool { + guard let curScalar = loadScalar(), + bitset.matches(scalar: curScalar), + let idx = input.unicodeScalars.index(currentPosition, offsetBy: 1, limitedBy: end) else { + signalFailure() + return false + } + currentPosition = idx + return true + } + mutating func signalFailure() { - guard let (pc, pos, stackEnd, capEnds, intRegisters) = + guard let (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoints.popLast()?.destructure else { state = .fail @@ -259,6 +320,7 @@ extension Processor { callStack.removeLast(callStack.count - stackEnd.rawValue) storedCaptures = capEnds registers.ints = intRegisters + registers.positions = posRegisters } mutating func abort(_ e: Error? 
= nil) { @@ -315,7 +377,10 @@ extension Processor { registers[reg] = int controller.step() - + case .moveCurrentPosition: + let reg = payload.position + registers[reg] = currentPosition + controller.step() case .branch: controller.pc = payload.addr @@ -327,7 +392,13 @@ extension Processor { registers[int] -= 1 controller.step() } - + case .condBranchSamePosition: + let (addr, pos) = payload.pairedAddrPos + if registers[pos] == currentPosition { + controller.pc = addr + } else { + controller.step() + } case .save: let resumeAddr = payload.addr let sp = makeSavePoint(resumeAddr) @@ -369,23 +440,40 @@ extension Processor { } case .match: - let reg = payload.element - if match(registers[reg]) { - controller.step() + let (isCaseInsensitive, reg) = payload.elementPayload + if isCaseInsensitive { + if matchCaseInsensitive(registers[reg]) { + controller.step() + } + } else { + if match(registers[reg]) { + controller.step() + } } - case .matchSequence: - let reg = payload.sequence - let seq = registers[reg] - if matchSeq(seq) { - controller.step() + case .matchScalar: + let (scalar, caseInsensitive, boundaryCheck) = payload.scalarPayload + if caseInsensitive { + if matchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck) { + controller.step() + } + } else { + if matchScalar(scalar, boundaryCheck: boundaryCheck) { + controller.step() + } } case .matchBitset: - let reg = payload.bitset + let (isScalar, reg) = payload.bitsetPayload let bitset = registers[reg] - if matchBitset(bitset) { - controller.step() + if isScalar { + if matchBitsetScalar(bitset) { + controller.step() + } + } else { + if matchBitset(bitset) { + controller.step() + } } case .consumeBy: diff --git a/Sources/_StringProcessing/Engine/Registers.swift b/Sources/_StringProcessing/Engine/Registers.swift index c76413383..e5d33af8b 100644 --- a/Sources/_StringProcessing/Engine/Registers.swift +++ b/Sources/_StringProcessing/Engine/Registers.swift @@ -47,6 +47,8 @@ extension Processor { var ints: [Int] var 
values: [Any] + + var positions: [Input.Index] } } @@ -66,6 +68,12 @@ extension Processor.Registers { values[i.rawValue] = newValue } } + subscript(_ i: PositionRegister) -> Input.Index { + get { positions[i.rawValue] } + set { + positions[i.rawValue] = newValue + } + } subscript(_ i: ElementRegister) -> Input.Element { elements[i.rawValue] } @@ -89,6 +97,8 @@ extension Processor.Registers { } extension Processor.Registers { + static let sentinelIndex = "".startIndex + init( _ program: MEProgram, _ sentinel: String.Index @@ -120,11 +130,15 @@ extension Processor.Registers { self.values = Array( repeating: SentinelValue(), count: info.values) + self.positions = Array( + repeating: Processor.Registers.sentinelIndex, + count: info.positions) } mutating func reset(sentinel: Input.Index) { self.ints._setAll(to: 0) self.values._setAll(to: SentinelValue()) + self.positions._setAll(to: Processor.Registers.sentinelIndex) } } diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 21c611d43..80f2e7697 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -315,8 +315,7 @@ extension PrettyPrinter { return } - var charMembers = "" - + var charMembers = StringLiteralBuilder() // This iterates through all of the character class members collecting all // of the members who can be stuffed into a singular '.anyOf(...)' vs. 
@@ -340,14 +339,10 @@ extension PrettyPrinter { switch a { case let .char(c): charMembers.append(c) - - if c == "\\" { - charMembers.append(c) - } - return false case let .scalar(s): - charMembers += "\\u{\(String(s.value, radix: 16, uppercase: true))}" + charMembers.append( + unescaped: "\\u{\(String(s.value, radix: 16, uppercase: true))}") return false case .unconverted(_): return true @@ -356,7 +351,7 @@ extension PrettyPrinter { } case let .quotedLiteral(s): - charMembers += s + charMembers.append(s) return false case .trivia(_): @@ -370,7 +365,7 @@ extension PrettyPrinter { // Also in the same vein, if we have a few atom members but no // nonAtomMembers, then we can emit a single .anyOf(...) for them. if !charMembers.isEmpty, nonCharMembers.isEmpty { - let anyOf = ".anyOf(\(charMembers._quoted))" + let anyOf = ".anyOf(\(charMembers))" indent() @@ -393,7 +388,7 @@ extension PrettyPrinter { printer.indent() if !charMembers.isEmpty { - printer.output(".anyOf(\(charMembers._quoted))") + printer.output(".anyOf(\(charMembers))") if nonCharMembers.count > 0 { printer.output(",") @@ -617,10 +612,39 @@ extension PrettyPrinter { } extension String { - // TODO: Escaping? + fileprivate var _escaped: String { + _replacing(#"\"#, with: #"\\"#)._replacing(#"""#, with: #"\""#) + } + fileprivate var _quoted: String { - "\"\(self._replacing(#"\"#, with: #"\\"#)._replacing(#"""#, with: #"\""#))\"" + _escaped._bareQuoted + } + + fileprivate var _bareQuoted: String { + #""\#(self)""# + } +} + +/// A helper for building string literals, which handles escaping the contents +/// appended. 
+fileprivate struct StringLiteralBuilder { + private var contents = "" + + var result: String { contents._bareQuoted } + var isEmpty: Bool { contents.isEmpty } + + mutating func append(_ str: String) { + contents += str._escaped + } + mutating func append(_ c: Character) { + contents += String(c)._escaped } + mutating func append(unescaped str: String) { + contents += str + } +} +extension StringLiteralBuilder: CustomStringConvertible { + var description: String { result } } extension DSLTree.Atom.Assertion { @@ -1121,8 +1145,8 @@ extension DSLTree.Atom { case let .scalar(s): let hex = String(s.value, radix: 16, uppercase: true) - return ("\\u{\(hex)}"._quoted, false) - + return ("\\u{\(hex)}"._bareQuoted, false) + case let .unconverted(a): if a.ast.isUnprintableAtom { return ("#/\(a.ast._regexBase)/#", false) @@ -1169,7 +1193,7 @@ extension DSLTree.Atom { case let .scalar(s): let hex = String(s.value, radix: 16, uppercase: true) - return "\\u{\(hex)}"._quoted + return "\\u{\(hex)}"._bareQuoted case let .unconverted(a): return a.ast._regexBase diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 2146fd61b..c4ac8e759 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -244,7 +244,7 @@ extension AST.Atom { switch self.kind { case let .char(c): return .char(c) - case let .scalar(s): return .char(Character(s.value)) + case let .scalar(s): return .scalar(s.value) case .dot: return .dot case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 449baa6a7..4ea905fd5 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -159,86 +159,6 @@ extension DSLTree { indirect case 
subtraction(CustomCharacterClass, CustomCharacterClass) indirect case symmetricDifference(CustomCharacterClass, CustomCharacterClass) } - - internal struct AsciiBitset { - let isInverted: Bool - var a: UInt64 = 0 - var b: UInt64 = 0 - - init(isInverted: Bool) { - self.isInverted = isInverted - } - - init(_ val: UInt8, _ isInverted: Bool, _ isCaseInsensitive: Bool) { - self.isInverted = isInverted - add(val, isCaseInsensitive) - } - - init(low: UInt8, high: UInt8, isInverted: Bool, isCaseInsensitive: Bool) { - self.isInverted = isInverted - for val in low...high { - add(val, isCaseInsensitive) - } - } - - internal init( - a: UInt64, - b: UInt64, - isInverted: Bool - ) { - self.isInverted = isInverted - self.a = a - self.b = b - } - - internal mutating func add(_ val: UInt8, _ isCaseInsensitive: Bool) { - setBit(val) - if isCaseInsensitive { - switch val { - case 64...90: setBit(val + 32) - case 97...122: setBit(val - 32) - default: break - } - } - } - - internal mutating func setBit(_ val: UInt8) { - if val < 64 { - a = a | 1 << val - } else { - b = b | 1 << (val - 64) - } - } - - internal func matches(char: Character) -> Bool { - let ret: Bool - if let val = char.asciiValue { - if val < 64 { - ret = (a >> val) & 1 == 1 - } else { - ret = (b >> (val - 64)) & 1 == 1 - } - } else { - ret = false - } - - if isInverted { - return !ret - } - - return ret - } - - /// Joins another bitset from a Member of the same CustomCharacterClass - internal func union(_ other: AsciiBitset) -> AsciiBitset { - precondition(self.isInverted == other.isInverted) - return AsciiBitset( - a: self.a | other.a, - b: self.b | other.b, - isInverted: self.isInverted - ) - } - } } @_spi(RegexBuilder) diff --git a/Sources/_StringProcessing/Utility/AsciiBitset.swift b/Sources/_StringProcessing/Utility/AsciiBitset.swift new file mode 100644 index 000000000..ad3159820 --- /dev/null +++ b/Sources/_StringProcessing/Utility/AsciiBitset.swift @@ -0,0 +1,99 @@ +extension DSLTree.CustomCharacterClass { + 
internal struct AsciiBitset { + let isInverted: Bool + var a: UInt64 = 0 + var b: UInt64 = 0 + + init(isInverted: Bool) { + self.isInverted = isInverted + } + + init(_ val: UInt8, _ isInverted: Bool, _ isCaseInsensitive: Bool) { + self.isInverted = isInverted + add(val, isCaseInsensitive) + } + + init(low: UInt8, high: UInt8, isInverted: Bool, isCaseInsensitive: Bool) { + self.isInverted = isInverted + for val in low...high { + add(val, isCaseInsensitive) + } + } + + internal init( + a: UInt64, + b: UInt64, + isInverted: Bool + ) { + self.isInverted = isInverted + self.a = a + self.b = b + } + + internal mutating func add(_ val: UInt8, _ isCaseInsensitive: Bool) { + setBit(val) + if isCaseInsensitive { + switch val { + case 64...90: setBit(val + 32) + case 97...122: setBit(val - 32) + default: break + } + } + } + + internal mutating func setBit(_ val: UInt8) { + if val < 64 { + a = a | 1 << val + } else { + b = b | 1 << (val - 64) + } + } + + private func matches(_ val: UInt8) -> Bool { + if val < 64 { + return (a >> val) & 1 == 1 + } else { + return (b >> (val - 64)) & 1 == 1 + } + } + + internal func matches(char: Character) -> Bool { + let matched: Bool + if let val = char._singleScalarAsciiValue { + matched = matches(val) + } else { + matched = false + } + + if isInverted { + return !matched + } + return matched + } + + internal func matches(scalar: Unicode.Scalar) -> Bool { + let matched: Bool + if scalar.isASCII { + let val = UInt8(ascii: scalar) + matched = matches(val) + } else { + matched = false + } + + if isInverted { + return !matched + } + return matched + } + + /// Joins another bitset from a Member of the same CustomCharacterClass + internal func union(_ other: AsciiBitset) -> AsciiBitset { + precondition(self.isInverted == other.isInverted) + return AsciiBitset( + a: self.a | other.a, + b: self.b | other.b, + isInverted: self.isInverted + ) + } + } +} diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift 
b/Tests/RegexBuilderTests/RegexDSLTests.swift index 1cf039b35..84a2d11ad 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -13,6 +13,10 @@ import XCTest import _StringProcessing import RegexBuilder +#if os(Linux) +func XCTExpectFailure(_ message: String? = nil, body: () throws -> Void) rethrows {} +#endif + class RegexDSLTests: XCTestCase { func _testDSLCaptures( _ tests: (input: String, expectedCaptures: MatchType?)..., @@ -1374,6 +1378,66 @@ class RegexDSLTests: XCTestCase { } } + func testScalarMatching() throws { + // RegexBuilder provides a RegexComponent conformance for UnicodeScalar. In + // grapheme cluster mode, it should only match entire graphemes. It may + // match a single scalar of a grapheme cluster in scalar semantic mode. + XCTAssertNotNil("a".firstMatch(of: "a" as UnicodeScalar)) + XCTAssertNil("a\u{301}".firstMatch(of: "a" as UnicodeScalar)) + XCTAssertNotNil("a\u{301}".firstMatch( + of: ("a" as UnicodeScalar).regex.matchingSemantics(.unicodeScalar))) + + let r1 = Regex { + "a" as UnicodeScalar + } + XCTAssertNil(try r1.firstMatch(in: "a\u{301}")) + XCTAssertNotNil( + try r1.matchingSemantics(.unicodeScalar).firstMatch(in: "a\u{301}") + ) + + let r2 = Regex { + CharacterClass.anyOf(["a" as UnicodeScalar, "👍"]) + } + XCTAssertNil(try r2.firstMatch(in: "a\u{301}")) + XCTAssertNotNil( + try r2.matchingSemantics(.unicodeScalar).firstMatch(in: "a\u{301}") + ) + + let r3 = Regex { + "👨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👧" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👦" as UnicodeScalar + } + XCTAssertNil(try r3.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦")) + + let r4 = Regex { "é" as UnicodeScalar } +
XCTAssertNotNil( + try r4.firstMatch(in: "e\u{301}") + ) + XCTAssertNotNil( + try r4.firstMatch(in: "é") + ) + + try XCTExpectFailure("Need stronger scalar coalescing logic") { + let r5 = Regex { + "e" + "\u{301}" as UnicodeScalar + } + XCTAssertNotNil( + try r5.firstMatch(in: "e\u{301}") + ) + XCTAssertNotNil( + try r5.firstMatch(in: "é") + ) + } + } + struct SemanticVersion: Equatable { var major: Int var minor: Int diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 4e64f7335..6c8f66e10 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -14,6 +14,131 @@ import XCTest +enum DecodedInstr { + case invalid + case moveImmediate + case moveCurrentPosition + case branch + case condBranchZeroElseDecrement + case condBranchSamePosition + case save + case saveAddress + case splitSaving + case clear + case clearThrough + case accept + case fail + case advance + case match + case matchCaseInsensitive + case matchScalar + case matchScalarCaseInsensitiveUnchecked + case matchScalarCaseInsensitive + case matchScalarUnchecked + case matchBitsetScalar + case matchBitset + case consumeBy + case assertBy + case matchBy + case backreference + case beginCapture + case endCapture + case transformCapture + case captureValue + case builtinAssertion + case builtinCharacterClass +} + +extension DecodedInstr { + /// Decode the given instruction by looking at the opcode and payload, expanding out certain instructions + /// like matchScalar and match into their variants + /// + /// Must stay in sync with Processor.cycle + static func decode(_ instruction: Instruction) -> DecodedInstr { + let (opcode, payload) = instruction.destructure + + switch opcode { + case .invalid: + fatalError("Invalid program") + case .moveImmediate: + return .moveImmediate + case .moveCurrentPosition: + return .moveCurrentPosition + case .branch: + return .branch + case .condBranchZeroElseDecrement: + return
.condBranchZeroElseDecrement + case .condBranchSamePosition: + return .condBranchSamePosition + case .save: + return .save + case .saveAddress: + return .saveAddress + case .splitSaving: + return .splitSaving + case .clear: + return .clear + case .clearThrough: + return .clearThrough + case .accept: + return .accept + case .fail: + return .fail + case .advance: + return .advance + case .match: + let (isCaseInsensitive, _) = payload.elementPayload + if isCaseInsensitive { + return .matchCaseInsensitive + } else { + return .match + } + case .matchScalar: + let (_, caseInsensitive, boundaryCheck) = payload.scalarPayload + if caseInsensitive { + if boundaryCheck { + return .matchScalarCaseInsensitive + } else { + return .matchScalarCaseInsensitiveUnchecked + } + } else { + if boundaryCheck { + return .matchScalar + } else { + return .matchScalarUnchecked + } + } + case .matchBitset: + let (isScalar, _) = payload.bitsetPayload + if isScalar { + return .matchBitsetScalar + } else { + return .matchBitset + } + case .consumeBy: + return consumeBy + case .assertBy: + return .assertBy + case .matchBy: + return .matchBy + case .backreference: + return .backreference + case .beginCapture: + return .beginCapture + case .endCapture: + return .endCapture + case .transformCapture: + return .transformCapture + case .captureValue: + return .captureValue + case .builtinAssertion: + return .builtinAssertion + case .builtinCharacterClass: + return .builtinCharacterClass +} + } +} + extension RegexTests { private func testCompilationEquivalence( @@ -147,16 +272,24 @@ extension RegexTests { for regex: String, syntax: SyntaxOptions = .traditional, semanticLevel: RegexSemanticLevel? 
= nil, - contains targets: Set<Instruction.OpCode>, + contains targets: Set<DecodedInstr> = [], + doesNotContain invalid: Set<DecodedInstr> = [], file: StaticString = #file, line: UInt = #line ) { do { let prog = try _compileRegex(regex, syntax, semanticLevel) - var found: Set<Instruction.OpCode> = [] + var found: Set<DecodedInstr> = [] for inst in prog.engine.instructions { - if targets.contains(inst.opcode) { - found.insert(inst.opcode) + let decoded = DecodedInstr.decode(inst) + found.insert(decoded) + + if invalid.contains(decoded) { + XCTFail( + "Compiled regex '\(regex)' contains incorrect opcode \(decoded)", + file: file, + line: line) + return } } @@ -174,38 +307,130 @@ extension RegexTests { } } - private func expectProgram( - for regex: String, - syntax: SyntaxOptions = .traditional, - semanticLevel: RegexSemanticLevel? = nil, - doesNotContain targets: Set<Instruction.OpCode>, - file: StaticString = #file, - line: UInt = #line - ) { - do { - let prog = try _compileRegex(regex, syntax, semanticLevel) - for inst in prog.engine.instructions { - if targets.contains(inst.opcode) { - XCTFail( - "Compiled regex '\(regex)' contains incorrect opcode \(inst.opcode)", - file: file, - line: line) - return - } - } - } catch { - XCTFail( - "Failed to compile regex '\(regex)': \(error)", - file: file, - line: line) - } + func testBitsetCompile() { + expectProgram( + for: "[abc]", + contains: [.matchBitset], + doesNotContain: [.consumeBy, .matchBitsetScalar]) + expectProgram( + for: "[abc]", + semanticLevel: .unicodeScalar, + contains: [.matchBitsetScalar], + doesNotContain: [.matchBitset, .consumeBy]) } - func testBitsetCompile() { - expectProgram(for: "[abc]", contains: [.matchBitset]) - expectProgram(for: "[abc]", doesNotContain: [.consumeBy]) + func testScalarOptimizeCompilation() { + // all ascii quoted literal -> elide boundary checks + expectProgram( + for: "abcd", + contains: [.matchScalar, .matchScalarUnchecked], + doesNotContain: [.match, .consumeBy]) + // ascii character -> matchScalar with boundary check + expectProgram( + for: "a", + contains:
[.matchScalar], + doesNotContain: [.match, .consumeBy, .matchScalarUnchecked]) + // quoted literal is not all ascii -> match scalar when possible, always do boundary checks + expectProgram( + for: "aaa\u{301}", + contains: [.match, .matchScalar], + doesNotContain: [.consumeBy, .matchScalarUnchecked]) + // scalar mode -> always emit match scalar without boundary checks + expectProgram( + for: "abcd", + semanticLevel: .unicodeScalar, + contains: [.matchScalarUnchecked], + doesNotContain: [.match, .consumeBy, .matchScalar]) + expectProgram( + for: "a", + semanticLevel: .unicodeScalar, + contains: [.matchScalarUnchecked], + doesNotContain: [.match, .consumeBy, .matchScalar]) + expectProgram( + for: "aaa\u{301}", + semanticLevel: .unicodeScalar, + contains: [.matchScalarUnchecked], + doesNotContain: [.match, .consumeBy, .matchScalar]) + } + + func testCaseInsensitivityCompilation() { + // quoted literal is all ascii -> match scalar case insensitive and skip + // boundary checks + expectProgram( + for: "(?i)abcd", + contains: [.matchScalarCaseInsensitiveUnchecked, .matchScalarCaseInsensitive], + doesNotContain: [.match, .matchCaseInsensitive, .matchScalar, .matchScalarUnchecked]) + // quoted literal is all non-cased ascii -> emit match scalar instructions + expectProgram( + for: "(?i)&&&&", + contains: [.matchScalar, .matchScalarUnchecked], + doesNotContain: [.match, .matchCaseInsensitive, + .matchScalarCaseInsensitive, .matchScalarCaseInsensitiveUnchecked]) + // quoted literal is not all ascii -> match scalar case insensitive when + // possible, match character case insensitive when needed, always perform + // boundary check + expectProgram( + for: "(?i)abcd\u{301}", + contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive], + doesNotContain: [.matchScalarCaseInsensitiveUnchecked, .match, .matchScalar]) + // same as before but contains ascii non cased characters -> emit matchScalar for them + expectProgram( + for: "(?i)abcd\u{301};.'!", + contains: 
[.matchCaseInsensitive, .matchScalarCaseInsensitive, .matchScalar], + doesNotContain: [.matchScalarCaseInsensitiveUnchecked, .match]) + // contains non-ascii non-cased characters -> emit match + expectProgram( + for: "(?i)abcd\u{301};.'!💖", + contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive, .matchScalar, .match], + doesNotContain: [.matchScalarCaseInsensitiveUnchecked]) + + // scalar mode -> emit unchecked scalar match only, emit case insensitive + // only if the scalar is cased + expectProgram( + for: "(?i);.'!💖", + semanticLevel: .unicodeScalar, + contains: [.matchScalarUnchecked], + doesNotContain: [.matchScalarCaseInsensitiveUnchecked]) + expectProgram( + for: "(?i)abcdé", + semanticLevel: .unicodeScalar, + contains: [.matchScalarCaseInsensitiveUnchecked], + doesNotContain: [.matchScalarUnchecked]) + } - expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, doesNotContain: [.matchBitset]) - expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, contains: [.consumeBy]) + func testQuantificationForwardProgressCompile() { + // Unbounded quantification + non forward progressing inner nodes + // Expect to emit the position checking instructions + expectProgram(for: #"(?:(?=a)){1,}"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\b)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:(?#comment))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:|)+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|)+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?i-i:))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?#comment))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?#comment)(?i-i:))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) +
expectProgram(for: #"(?:\w|(?i))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + + // Bounded quantification, don't emit position checking + expectProgram(for: #"(?:(?=a)){1,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\b)?"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:(?#comment)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:|){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?i-i:)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?#comment)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?#comment)(?i-i:)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?i)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + + // Inner node is a quantification that does not guarantee forward progress + expectProgram(for: #"(a*)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(a?)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(a{,5})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"((\b){,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"((\b){1,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"((|){1,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + // Inner node is a quantification that guarantees forward progress + expectProgram(for: #"(a+)*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(a{1,})*"#, doesNotContain: 
[.moveCurrentPosition, .condBranchSamePosition]) } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 8f7baf4b9..a8f7977d6 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -163,7 +163,7 @@ func firstMatchTest( if xfail { XCTAssertNotEqual(found, match, file: file, line: line) } else { - XCTAssertEqual(found, match, file: file, line: line) + XCTAssertEqual(found, match, "Incorrect match", file: file, line: line) } } catch { // FIXME: This allows non-matches to succeed even when xfail'd @@ -603,6 +603,12 @@ extension RegexTests { ("A", true), ("a", false)) + matchTest(#"(?i)[a]"#, + ("💿", false), + ("a\u{301}", false), + ("A", true), + ("a", true)) + matchTest("[a]", ("a\u{301}", false)) @@ -617,14 +623,12 @@ extension RegexTests { // interpreted as matching the scalars "\r" or "\n". // It does not fully match the character "\r\n" because the character class // in scalar mode will only match one scalar - do { - let regex = try Regex("[\r\n]").matchingSemantics(.unicodeScalar) - XCTAssertEqual("\r", try regex.wholeMatch(in: "\r")?.0) - XCTAssertEqual("\n", try regex.wholeMatch(in: "\n")?.0) - XCTAssertEqual(nil, try regex.wholeMatch(in: "\r\n")?.0) - } catch { - XCTFail("\(error)", file: #filePath, line: #line) - } + matchTest( + "^[\r\n]$", + ("\r", true), + ("\n", true), + ("\r\n", false), + semanticLevel: .unicodeScalar) matchTest("[^\r\n]", ("\r\n", false), @@ -632,7 +636,17 @@ extension RegexTests { ("\r", true)) matchTest("[\n\r]", ("\n", true), - ("\r", true)) + ("\r", true), + ("\r\n", false)) + + matchTest( + #"[a]\u0301"#, + ("a\u{301}", false), + semanticLevel: .graphemeCluster) + matchTest( + #"[a]\u0301"#, + ("a\u{301}", true), + semanticLevel: .unicodeScalar) let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" @@ -1903,6 +1917,19 @@ extension RegexTests { // TODO: Add test for grapheme boundaries at start/end
of match + // Testing the matchScalar optimization for ascii quoted literals and characters + func testScalarOptimization() throws { + // check that we are correctly doing the boundary check after matchScalar + firstMatchTest("a", input: "a\u{301}", match: nil) + firstMatchTest("aa", input: "aa\u{301}", match: nil) + + firstMatchTest("a", input: "a\u{301}", match: "a", semanticLevel: .unicodeScalar) + firstMatchTest("aa", input: "aa\u{301}", match: "aa", semanticLevel: .unicodeScalar) + + // case insensitive tests + firstMatchTest(#"(?i)abc\u{301}d"#, input: "AbC\u{301}d", match: "AbC\u{301}d", semanticLevel: .unicodeScalar) + } + func testCase() { let regex = try! Regex(#".\N{SPARKLING HEART}."#) let input = "🧟‍♀️💖🧠 or 🧠💖☕️" @@ -1943,5 +1970,31 @@ extension RegexTests { XCTAssertEqual(matches.count, 3) } } -} + func expectCompletion(regex: String, in target: String) { + let expectation = XCTestExpectation(description: "Run the given regex to completion") + Task.init { + let r = try!
Regex(regex) + let val = target.matches(of: r).isEmpty + expectation.fulfill() + return val + } + wait(for: [expectation], timeout: 3.0) + } + + func testQuantificationForwardProgress() { + expectCompletion(regex: #"(?:(?=a)){1,}"#, in: "aa") + expectCompletion(regex: #"(?:\b)+"#, in: "aa") + expectCompletion(regex: #"(?:(?#comment))+"#, in: "aa") + expectCompletion(regex: #"(?:|)+"#, in: "aa") + expectCompletion(regex: #"(?:\w|)+"#, in: "aa") + expectCompletion(regex: #"(?:\w|(?i-i:))+"#, in: "aa") + expectCompletion(regex: #"(?:\w|(?#comment))+"#, in: "aa") + expectCompletion(regex: #"(?:\w|(?#comment)(?i-i:))+"#, in: "aa") + expectCompletion(regex: #"(?:\w|(?i))+"#, in: "aa") + expectCompletion(regex: #"(a*)*"#, in: "aa") + expectCompletion(regex: #"(a?)*"#, in: "aa") + expectCompletion(regex: #"(a{,4})*"#, in: "aa") + expectCompletion(regex: #"((|)+)*"#, in: "aa") + } +} diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index e33b10c31..3b0a8d5b3 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -148,4 +148,34 @@ extension RenderDSLTests { } """#) } + + func testScalar() throws { + try testConversion(#"\u{B4}"#, #""" + Regex { + "\u{B4}" + } + """#) + try testConversion(#"\u{301}"#, #""" + Regex { + "\u{301}" + } + """#) + try testConversion(#"[\u{301}]"#, #""" + Regex { + One(.anyOf("\u{301}")) + } + """#) + try testConversion(#"[abc\u{301}]"#, #""" + Regex { + One(.anyOf("abc\u{301}")) + } + """#) + + // TODO: We ought to try and preserve the scalar syntax here. + try testConversion(#"a\u{301}"#, #""" + Regex { + "á" + } + """#) + } }