diff --git a/Package.swift b/Package.swift
index f8162e762..8303fc5cb 100644
--- a/Package.swift
+++ b/Package.swift
@@ -67,7 +67,7 @@ let package = Package(
       name: "RegexTests",
       dependencies: ["_StringProcessing"],
       swiftSettings: [
-        .unsafeFlags(["-Xfrontend", "-disable-availability-checking"])
+        .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]),
       ]),
     .testTarget(
       name: "RegexBuilderTests",
diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
index 21fcfa703..2131d1eb5 100644
--- a/Sources/_StringProcessing/ByteCodeGen.swift
+++ b/Sources/_StringProcessing/ByteCodeGen.swift
@@ -168,7 +168,15 @@ extension Compiler.ByteCodeGen {
   }
 
   mutating func emitCharacter(_ c: Character) throws {
-    // FIXME: Does semantic level matter?
+    // Unicode scalar matches the specific scalars that comprise a character
+    if options.semanticLevel == .unicodeScalar {
+      print("emitting '\(c)' as a sequence of \(c.unicodeScalars.count) scalars")
+      for scalar in c.unicodeScalars {
+        try emitScalar(scalar)
+      }
+      return
+    }
+
     if options.isCaseInsensitive && c.isCased {
       // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true)
       builder.buildConsume { input, bounds in
@@ -625,22 +633,44 @@ extension Compiler.ByteCodeGen {
       try emitAtom(a)
 
     case let .quotedLiteral(s):
-      // TODO: Should this incorporate options?
-      if options.isCaseInsensitive {
-        // TODO: buildCaseInsensitiveMatchSequence(c) or alternative
-        builder.buildConsume { input, bounds in
-          var iterator = s.makeIterator()
+      if options.semanticLevel == .graphemeCluster {
+        if options.isCaseInsensitive {
+          // TODO: buildCaseInsensitiveMatchSequence(c) or alternative
+          builder.buildConsume { input, bounds in
+            var iterator = s.makeIterator()
+            var currentIndex = bounds.lowerBound
+            while let ch = iterator.next() {
+              guard currentIndex < bounds.upperBound,
+                    ch.lowercased() == input[currentIndex].lowercased()
+              else { return nil }
+              input.formIndex(after: &currentIndex)
+            }
+            return currentIndex
+          }
+        } else {
+          builder.buildMatchSequence(s)
+        }
+      } else {
+        builder.buildConsume {
+          [caseInsensitive = options.isCaseInsensitive] input, bounds in
+          // TODO: Case folding
+          var iterator = s.unicodeScalars.makeIterator()
           var currentIndex = bounds.lowerBound
-          while let ch = iterator.next() {
-            guard currentIndex < bounds.upperBound,
-                  ch.lowercased() == input[currentIndex].lowercased()
-            else { return nil }
-            input.formIndex(after: &currentIndex)
+          while let scalar = iterator.next() {
+            guard currentIndex < bounds.upperBound else { return nil }
+            if caseInsensitive {
+              if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping {
+                return nil
+              }
+            } else {
+              if scalar != input.unicodeScalars[currentIndex] {
+                return nil
+              }
+            }
+            input.unicodeScalars.formIndex(after: &currentIndex)
          }
           return currentIndex
         }
-      } else {
-        builder.buildMatchSequence(s)
       }
 
     case let .regexLiteral(l):
diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
index a44c2c876..d27b89314 100644
--- a/Sources/_StringProcessing/ConsumerInterface.swift
+++ b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -111,6 +111,38 @@ extension DSLTree.Atom {
   }
 }
 
+extension String {
+  /// Compares this string to `other` using the loose matching rule UAX44-LM2,
+  /// which ignores case, whitespace, underscores, and nearly all medial
+  /// hyphens.
+ /// + /// FIXME: Only ignore medial hyphens + /// FIXME: Special case for U+1180 HANGUL JUNGSEONG O-E + /// See https://www.unicode.org/reports/tr44/#Matching_Rules + fileprivate func isEqualByUAX44LM2(to other: String) -> Bool { + var index = startIndex + var otherIndex = other.startIndex + + while index < endIndex && otherIndex < other.endIndex { + if self[index].isWhitespace || self[index] == "-" || self[index] == "_" { + formIndex(after: &index) + continue + } + if other[otherIndex].isWhitespace || other[otherIndex] == "-" || other[otherIndex] == "_" { + other.formIndex(after: &otherIndex) + continue + } + + if self[index] != other[otherIndex] && self[index].lowercased() != other[otherIndex].lowercased() { + return false + } + + formIndex(after: &index) + other.formIndex(after: &otherIndex) + } + return index == endIndex && otherIndex == other.endIndex + } +} // TODO: This is basically an AST interpreter, which would // be good or interesting to build regardless, and serves @@ -131,6 +163,13 @@ extension AST.Atom { } } + var singleScalar: UnicodeScalar? { + switch kind { + case .scalar(let s): return s + default: return nil + } + } + func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { @@ -167,10 +206,12 @@ extension AST.Atom { return try p.generateConsumer(opts) case let .namedCharacter(name): - return consumeScalarProp { - // TODO: alias? casing? - $0.name == name || $0.nameAlias == name - } + return consumeScalar(propertyScalarPredicate { + // FIXME: name aliases not covered by $0.nameAlias are missed + // e.g. U+FEFF is also 'FORM FEED', 'BYTE ORDER MARK', and 'BOM' + $0.name?.isEqualByUAX44LM2(to: name) == true + || $0.nameAlias?.isEqualByUAX44LM2(to: name) == true + }) case .any: assertionFailure( @@ -312,8 +353,9 @@ extension DSLTree.CustomCharacterClass { } } if isInverted { - // FIXME: semantic level - return input.index(after: bounds.lowerBound) + return opts.semanticLevel == .graphemeCluster + ? 
input.index(after: bounds.lowerBound) + : input.unicodeScalars.index(after: bounds.lowerBound) } return nil } @@ -321,38 +363,26 @@ extension DSLTree.CustomCharacterClass { } // NOTE: Conveniences, though not most performant -private func consumeScalarScript( - _ s: Unicode.Script -) -> MEProgram.ConsumeFunction { - consumeScalar { - Unicode.Script($0) == s - } +typealias ScalarPredicate = (UnicodeScalar) -> Bool + +private func scriptScalarPredicate(_ s: Unicode.Script) -> ScalarPredicate { + { Unicode.Script($0) == s } } -private func consumeScalarScriptExtension( - _ s: Unicode.Script -) -> MEProgram.ConsumeFunction { - consumeScalar { - let extensions = Unicode.Script.extensions(for: $0) - return extensions.contains(s) - } +private func scriptExtensionScalarPredicate(_ s: Unicode.Script) -> ScalarPredicate { + { Unicode.Script.extensions(for: $0).contains(s) } } -private func consumeScalarGC( - _ gc: Unicode.GeneralCategory -) -> MEProgram.ConsumeFunction { - consumeScalar { gc == $0.properties.generalCategory } +private func categoryScalarPredicate(_ gc: Unicode.GeneralCategory) -> ScalarPredicate { + { gc == $0.properties.generalCategory } } -private func consumeScalarGCs( - _ gcs: [Unicode.GeneralCategory] -) -> MEProgram.ConsumeFunction { - consumeScalar { gcs.contains($0.properties.generalCategory) } +private func categoriesScalarPredicate(_ gcs: [Unicode.GeneralCategory]) -> ScalarPredicate { + { gcs.contains($0.properties.generalCategory) } } -private func consumeScalarProp( - _ p: @escaping (Unicode.Scalar.Properties) -> Bool -) -> MEProgram.ConsumeFunction { - consumeScalar { p($0.properties) } +private func propertyScalarPredicate(_ p: @escaping (Unicode.Scalar.Properties) -> Bool) -> ScalarPredicate { + { p($0.properties) } } + func consumeScalar( - _ p: @escaping (Unicode.Scalar) -> Bool + _ p: @escaping ScalarPredicate ) -> MEProgram.ConsumeFunction { { input, bounds in // TODO: bounds check? @@ -364,6 +394,37 @@ func consumeScalar( return nil } } +func consumeCharacterWithLeadingScalar( + _ p: @escaping ScalarPredicate +) -> MEProgram.ConsumeFunction { + { input, bounds in + let curIdx = bounds.lowerBound + if p(input[curIdx].unicodeScalars.first!) { + return input.index(after: curIdx) + } + return nil + } +} +func consumeCharacterWithSingleScalar( + _ p: @escaping ScalarPredicate +) -> MEProgram.ConsumeFunction { + { input, bounds in + let curIdx = bounds.lowerBound + + if input[curIdx].hasExactlyOneScalar && p(input[curIdx].unicodeScalars.first!) { + return input.index(after: curIdx) + } + return nil + } +} + +func consumeFunction( + for opts: MatchingOptions +) -> (@escaping ScalarPredicate) -> MEProgram.ConsumeFunction { + opts.semanticLevel == .graphemeCluster + ? consumeCharacterWithLeadingScalar + : consumeScalar +} extension AST.Atom.CharacterProperty { func generateConsumer( @@ -375,16 +436,15 @@ extension AST.Atom.CharacterProperty { ) -> MEProgram.ConsumeFunction { return { input, bounds in if p(input, bounds) != nil { return nil } - // TODO: semantic level + // TODO: bounds check - return input.unicodeScalars.index( - after: bounds.lowerBound) + return opts.semanticLevel == .graphemeCluster + ? input.index(after: bounds.lowerBound) + : input.unicodeScalars.index(after: bounds.lowerBound) } } - // FIXME: Below is largely scalar based, for convenience, - // but we want a comprehensive treatment to semantic mode - // switching. 
+ let consume = consumeFunction(for: opts) let preInversion: MEProgram.ConsumeFunction = try { switch kind { @@ -395,11 +455,16 @@ extension AST.Atom.CharacterProperty { return input.index(after: bounds.lowerBound) } case .assigned: - return consumeScalar { + return consume { $0.properties.generalCategory != .unassigned } case .ascii: - return consumeScalar(\.isASCII) + // Note: ASCII must look at the whole character, not just the first + // scalar. That is, "e\u{301}" is not an ASCII character, even though + // the first scalar is. + return opts.semanticLevel == .graphemeCluster + ? consumeCharacterWithSingleScalar(\.isASCII) + : consumeScalar(\.isASCII) case .generalCategory(let p): return try p.generateConsumer(opts) @@ -410,10 +475,10 @@ extension AST.Atom.CharacterProperty { return value ? cons : invert(cons) case .script(let s): - return consumeScalarScript(s) + return consume(scriptScalarPredicate(s)) case .scriptExtension(let s): - return consumeScalarScriptExtension(s) + return consume(scriptExtensionScalarPredicate(s)) case .posix(let p): return p.generateConsumer(opts) @@ -436,49 +501,48 @@ extension Unicode.BinaryProperty { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction { + let consume = consumeFunction(for: opts) + switch self { - case .asciiHexDigit: - return consumeScalarProp { + return consume(propertyScalarPredicate { $0.isHexDigit && $0.isASCIIHexDigit - } + }) case .alphabetic: - return consumeScalarProp(\.isAlphabetic) + return consume(propertyScalarPredicate(\.isAlphabetic)) case .bidiControl: break - - - case .bidiMirrored: - return consumeScalarProp(\.isBidiMirrored) + case .bidiMirrored: + return consume(propertyScalarPredicate(\.isBidiMirrored)) case .cased: - return consumeScalarProp(\.isCased) + return consume(propertyScalarPredicate(\.isCased)) case .compositionExclusion: break case .caseIgnorable: - return consumeScalarProp(\.isCaseIgnorable) + return consume(propertyScalarPredicate(\.isCaseIgnorable)) case .changesWhenCasefolded: - return consumeScalarProp(\.changesWhenCaseFolded) + return consume(propertyScalarPredicate(\.changesWhenCaseFolded)) case .changesWhenCasemapped: - return consumeScalarProp(\.changesWhenCaseMapped) + return consume(propertyScalarPredicate(\.changesWhenCaseMapped)) case .changesWhenNFKCCasefolded: - return consumeScalarProp(\.changesWhenNFKCCaseFolded) + return consume(propertyScalarPredicate(\.changesWhenNFKCCaseFolded)) case .changesWhenLowercased: - return consumeScalarProp(\.changesWhenLowercased) + return consume(propertyScalarPredicate(\.changesWhenLowercased)) case .changesWhenTitlecased: - return consumeScalarProp(\.changesWhenTitlecased) + return consume(propertyScalarPredicate(\.changesWhenTitlecased)) case .changesWhenUppercased: - return consumeScalarProp(\.changesWhenUppercased) + return consume(propertyScalarPredicate(\.changesWhenUppercased)) case .dash: - return consumeScalarProp(\.isDash) + return consume(propertyScalarPredicate(\.isDash)) case .deprecated: - return consumeScalarProp(\.isDeprecated) + return consume(propertyScalarPredicate(\.isDeprecated)) case .defaultIgnorableCodePoint: - return consumeScalarProp(\.isDefaultIgnorableCodePoint) + return consume(propertyScalarPredicate(\.isDefaultIgnorableCodePoint)) case .diacratic: // spelling? 
- return consumeScalarProp(\.isDiacritic) + return consume(propertyScalarPredicate(\.isDiacritic)) case .emojiModifierBase: if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { - return consumeScalarProp(\.isEmojiModifierBase) + return consume(propertyScalarPredicate(\.isEmojiModifierBase)) } else { throw Unsupported( "isEmojiModifierBase on old OSes") @@ -487,59 +551,59 @@ extension Unicode.BinaryProperty { break case .emojiModifier: if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { - return consumeScalarProp(\.isEmojiModifier) + return consume(propertyScalarPredicate(\.isEmojiModifier)) } else { throw Unsupported("isEmojiModifier on old OSes") } case .emoji: if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { - return consumeScalarProp(\.isEmoji) + return consume(propertyScalarPredicate(\.isEmoji)) } else { throw Unsupported("isEmoji on old OSes") } case .emojiPresentation: if #available(macOS 10.12.2, iOS 10.2, tvOS 10.1, watchOS 3.1.1, *) { - return consumeScalarProp(\.isEmojiPresentation) + return consume(propertyScalarPredicate(\.isEmojiPresentation)) } else { throw Unsupported( "isEmojiPresentation on old OSes") } case .extender: - return consumeScalarProp(\.isExtender) + return consume(propertyScalarPredicate(\.isExtender)) case .extendedPictographic: break // NOTE: Stdlib has this data internally case .fullCompositionExclusion: - return consumeScalarProp(\.isFullCompositionExclusion) + return consume(propertyScalarPredicate(\.isFullCompositionExclusion)) case .graphemeBase: - return consumeScalarProp(\.isGraphemeBase) + return consume(propertyScalarPredicate(\.isGraphemeBase)) case .graphemeExtended: - return consumeScalarProp(\.isGraphemeExtend) + return consume(propertyScalarPredicate(\.isGraphemeExtend)) case .graphemeLink: break case .hexDigit: - return consumeScalarProp(\.isHexDigit) + return consume(propertyScalarPredicate(\.isHexDigit)) case .hyphen: break case .idContinue: - return consumeScalarProp(\.isIDContinue) + return consume(propertyScalarPredicate(\.isIDContinue)) case .ideographic: - return consumeScalarProp(\.isIdeographic) + return consume(propertyScalarPredicate(\.isIdeographic)) case .idStart: - return consumeScalarProp(\.isIDStart) + return consume(propertyScalarPredicate(\.isIDStart)) case .idsBinaryOperator: - return consumeScalarProp(\.isIDSBinaryOperator) + return consume(propertyScalarPredicate(\.isIDSBinaryOperator)) case .idsTrinaryOperator: - return consumeScalarProp(\.isIDSTrinaryOperator) + return consume(propertyScalarPredicate(\.isIDSTrinaryOperator)) case .joinControl: - return consumeScalarProp(\.isJoinControl) + return consume(propertyScalarPredicate(\.isJoinControl)) case .logicalOrderException: - return consumeScalarProp(\.isLogicalOrderException) + return consume(propertyScalarPredicate(\.isLogicalOrderException)) case .lowercase: - return consumeScalarProp(\.isLowercase) + return consume(propertyScalarPredicate(\.isLowercase)) case .math: - return consumeScalarProp(\.isMath) + return consume(propertyScalarPredicate(\.isMath)) case .noncharacterCodePoint: - return consumeScalarProp(\.isNoncharacterCodePoint) + return consume(propertyScalarPredicate(\.isNoncharacterCodePoint)) case .otherAlphabetic: break case .otherDefaultIgnorableCodePoint: @@ -557,37 +621,37 @@ extension Unicode.BinaryProperty { case .otherUppercase: break case .patternSyntax: - return consumeScalarProp(\.isPatternSyntax) + return consume(propertyScalarPredicate(\.isPatternSyntax)) case .patternWhitespace: - return 
consumeScalarProp(\.isPatternWhitespace) + return consume(propertyScalarPredicate(\.isPatternWhitespace)) case .prependedConcatenationMark: break case .quotationMark: - return consumeScalarProp(\.isQuotationMark) + return consume(propertyScalarPredicate(\.isQuotationMark)) case .radical: - return consumeScalarProp(\.isRadical) + return consume(propertyScalarPredicate(\.isRadical)) case .regionalIndicator: - return consumeScalar { s in + return consume { s in (0x1F1E6...0x1F1FF).contains(s.value) } case .softDotted: - return consumeScalarProp(\.isSoftDotted) + return consume(propertyScalarPredicate(\.isSoftDotted)) case .sentenceTerminal: - return consumeScalarProp(\.isSentenceTerminal) + return consume(propertyScalarPredicate(\.isSentenceTerminal)) case .terminalPunctuation: - return consumeScalarProp(\.isTerminalPunctuation) + return consume(propertyScalarPredicate(\.isTerminalPunctuation)) case .unifiedIdiograph: // spelling? - return consumeScalarProp(\.isUnifiedIdeograph) + return consume(propertyScalarPredicate(\.isUnifiedIdeograph)) case .uppercase: - return consumeScalarProp(\.isUppercase) + return consume(propertyScalarPredicate(\.isUppercase)) case .variationSelector: - return consumeScalarProp(\.isVariationSelector) + return consume(propertyScalarPredicate(\.isVariationSelector)) case .whitespace: - return consumeScalarProp(\.isWhitespace) + return consume(propertyScalarPredicate(\.isWhitespace)) case .xidContinue: - return consumeScalarProp(\.isXIDContinue) + return consume(propertyScalarPredicate(\.isXIDContinue)) case .xidStart: - return consumeScalarProp(\.isXIDStart) + return consume(propertyScalarPredicate(\.isXIDStart)) case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD, .expandsOnNFKC: throw Unsupported("Unicode-deprecated: \(self)") @@ -602,42 +666,44 @@ extension Unicode.POSIXProperty { func generateConsumer( _ opts: MatchingOptions ) -> MEProgram.ConsumeFunction { - // FIXME: semantic levels, modes, etc + let consume = consumeFunction(for: opts) + + // FIXME: modes, etc switch self { case .alnum: - return consumeScalarProp { + return consume(propertyScalarPredicate { $0.isAlphabetic || $0.numericType != nil - } + }) case .blank: - return consumeScalar { s in + return consume { s in s.properties.generalCategory == .spaceSeparator || s == "\t" } case .graph: - return consumeScalarProp { p in + return consume(propertyScalarPredicate { p in !( p.isWhitespace || p.generalCategory == .control || p.generalCategory == .surrogate || p.generalCategory == .unassigned ) - } + }) case .print: - return consumeScalarProp { p in + return consume(propertyScalarPredicate { p in // FIXME: better def p.generalCategory != .control - } + }) case .word: - return consumeScalarProp { p in + return consume(propertyScalarPredicate { p in // FIXME: better def p.isAlphabetic || p.numericType != nil || p.isJoinControl || p.isDash// marks and connectors... 
- } + }) case .xdigit: - return consumeScalarProp(\.isHexDigit) // or number + return consume(propertyScalarPredicate(\.isHexDigit)) // or number } } @@ -648,113 +714,115 @@ extension Unicode.ExtendedGeneralCategory { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction { + let consume = consumeFunction(for: opts) + switch self { case .letter: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .uppercaseLetter, .lowercaseLetter, .titlecaseLetter, .modifierLetter, .otherLetter - ]) + ])) case .mark: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .nonspacingMark, .spacingMark, .enclosingMark - ]) + ])) case .number: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .decimalNumber, .letterNumber, .otherNumber - ]) + ])) case .symbol: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .mathSymbol, .currencySymbol, .modifierSymbol, .otherSymbol - ]) + ])) case .punctuation: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .connectorPunctuation, .dashPunctuation, .openPunctuation, .closePunctuation, .initialPunctuation, .finalPunctuation, .otherPunctuation - ]) + ])) case .separator: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .spaceSeparator, .lineSeparator, .paragraphSeparator - ]) + ])) case .other: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .control, .format, .surrogate, .privateUse, .unassigned - ]) + ])) case .casedLetter: - return consumeScalarGCs([ + return consume(categoriesScalarPredicate([ .uppercaseLetter, .lowercaseLetter, .titlecaseLetter - ]) + ])) case .control: - return consumeScalarGC(.control) + return consume(categoryScalarPredicate(.control)) case .format: - return consumeScalarGC(.format) + return consume(categoryScalarPredicate(.format)) case .unassigned: - return consumeScalarGC(.unassigned) + return consume(categoryScalarPredicate(.unassigned)) case .privateUse: - return consumeScalarGC(.privateUse) + return consume(categoryScalarPredicate(.privateUse)) case .surrogate: - return consumeScalarGC(.surrogate) + return consume(categoryScalarPredicate(.surrogate)) case .lowercaseLetter: - return consumeScalarGC(.lowercaseLetter) + return consume(categoryScalarPredicate(.lowercaseLetter)) case .modifierLetter: - return consumeScalarGC(.modifierLetter) + return consume(categoryScalarPredicate(.modifierLetter)) case .otherLetter: - return consumeScalarGC(.otherLetter) + return consume(categoryScalarPredicate(.otherLetter)) case .titlecaseLetter: - return consumeScalarGC(.titlecaseLetter) + return consume(categoryScalarPredicate(.titlecaseLetter)) case .uppercaseLetter: - return consumeScalarGC(.uppercaseLetter) + return consume(categoryScalarPredicate(.uppercaseLetter)) case .spacingMark: - return consumeScalarGC(.spacingMark) + return consume(categoryScalarPredicate(.spacingMark)) case .enclosingMark: - return consumeScalarGC(.enclosingMark) + return consume(categoryScalarPredicate(.enclosingMark)) case .nonspacingMark: - return consumeScalarGC(.nonspacingMark) + return consume(categoryScalarPredicate(.nonspacingMark)) case .decimalNumber: - return consumeScalarGC(.decimalNumber) + return consume(categoryScalarPredicate(.decimalNumber)) case .letterNumber: - return consumeScalarGC(.letterNumber) + return consume(categoryScalarPredicate(.letterNumber)) case .otherNumber: - return consumeScalarGC(.otherNumber) + return consume(categoryScalarPredicate(.otherNumber)) case 
.connectorPunctuation: - return consumeScalarGC(.connectorPunctuation) + return consume(categoryScalarPredicate(.connectorPunctuation)) case .dashPunctuation: - return consumeScalarGC(.dashPunctuation) + return consume(categoryScalarPredicate(.dashPunctuation)) case .closePunctuation: - return consumeScalarGC(.closePunctuation) + return consume(categoryScalarPredicate(.closePunctuation)) case .finalPunctuation: - return consumeScalarGC(.finalPunctuation) + return consume(categoryScalarPredicate(.finalPunctuation)) case .initialPunctuation: - return consumeScalarGC(.initialPunctuation) + return consume(categoryScalarPredicate(.initialPunctuation)) case .otherPunctuation: - return consumeScalarGC(.otherPunctuation) + return consume(categoryScalarPredicate(.otherPunctuation)) case .openPunctuation: - return consumeScalarGC(.openPunctuation) + return consume(categoryScalarPredicate(.openPunctuation)) case .currencySymbol: - return consumeScalarGC(.currencySymbol) + return consume(categoryScalarPredicate(.currencySymbol)) case .modifierSymbol: - return consumeScalarGC(.modifierSymbol) + return consume(categoryScalarPredicate(.modifierSymbol)) case .mathSymbol: - return consumeScalarGC(.mathSymbol) + return consume(categoryScalarPredicate(.mathSymbol)) case .otherSymbol: - return consumeScalarGC(.otherSymbol) + return consume(categoryScalarPredicate(.otherSymbol)) case .lineSeparator: - return consumeScalarGC(.lineSeparator) + return consume(categoryScalarPredicate(.lineSeparator)) case .paragraphSeparator: - return consumeScalarGC(.paragraphSeparator) + return consume(categoryScalarPredicate(.paragraphSeparator)) case .spaceSeparator: - return consumeScalarGC(.spaceSeparator) + return consume(categoryScalarPredicate(.spaceSeparator)) } } } diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index ef98a7b8f..47433dc42 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -65,13 +65,17 @@ extension AST.Node { // TODO: For printing, nice to coalesce // scalars literals too. We likely need a different // approach even before we have a better IR. - guard let char = atom?.singleCharacter else { + if let char = atom?.singleCharacter { + result.append(char) + } else if let scalar = atom?.singleScalar { + result.append(Character(scalar)) + } else { break } - result.append(char) + astChildren.formIndex(after: &idx) } - return result.count <= 1 ? nil : (idx, result) + return result.isEmpty ? nil : (idx, result) } // No need to nest single children concatenations @@ -207,7 +211,7 @@ extension AST.Atom { switch self.kind { case let .char(c): return .char(c) - case let .scalar(s): return .scalar(s) + case let .scalar(s): return .char(Character(s)) case .any: return .any case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) diff --git a/Sources/_StringProcessing/Unicode/CharacterProps.swift b/Sources/_StringProcessing/Unicode/CharacterProps.swift index cfa68c425..80f6819a6 100644 --- a/Sources/_StringProcessing/Unicode/CharacterProps.swift +++ b/Sources/_StringProcessing/Unicode/CharacterProps.swift @@ -12,3 +12,9 @@ // TODO +extension Character { + /// Whether this character is made up of exactly one Unicode scalar value. 
+ var hasExactlyOneScalar: Bool { + unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex + } +} diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 4d0c12c1f..fc3fd5741 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -194,11 +194,14 @@ public struct _CharacterClassModel: Hashable { return matched ? next : nil case .unicodeScalar: let c = str.unicodeScalars[i] + var nextIndex = str.unicodeScalars.index(after: i) var matched: Bool switch cc { case .any: matched = true case .anyScalar: matched = true - case .anyGrapheme: fatalError("Not matched in this mode") + case .anyGrapheme: + matched = true + nextIndex = str.index(after: i) case .digit: matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits) case .hexDigit: @@ -215,7 +218,7 @@ public struct _CharacterClassModel: Hashable { if isInverted { matched.toggle() } - return matched ? str.unicodeScalars.index(after: i) : nil + return matched ? nextIndex : nil } } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 2c6b858cc..83b73fe35 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -169,6 +169,8 @@ func firstMatchTest( XCTAssertEqual(found, match, file: file, line: line) } } catch { + // FIXME: This allows non-matches to succeed even when xfail'd + // When xfail == true, this should report failure for match == nil if !xfail && match != nil { XCTFail("\(error)", file: file, line: line) } @@ -182,7 +184,9 @@ func firstMatchTests( syntax: SyntaxOptions = .traditional, enableTracing: Bool = false, dumpAST: Bool = false, - xfail: Bool = false + xfail: Bool = false, + file: StaticString = #filePath, + line: UInt = #line ) { for (input, match) in tests { firstMatchTest( @@ -192,7 +196,9 @@ func firstMatchTests( syntax: syntax, enableTracing: enableTracing, dumpAST: dumpAST, - xfail: xfail) + xfail: xfail, + file: file, + line: line) } } @@ -400,7 +406,8 @@ extension RegexTests { "a++a", ("babc", nil), ("baaabc", nil), - ("bb", nil)) + ("bb", nil), + xfail: true) firstMatchTests( "a+?a", ("babc", nil), @@ -462,15 +469,11 @@ extension RegexTests { "a{2,4}+a", ("babc", nil), ("baabc", nil), - ("baaabc", nil), ("baaaaabc", "aaaaa"), ("baaaaaaaabc", "aaaaa"), ("bb", nil)) firstMatchTests( "a{,4}+a", - ("babc", nil), - ("baabc", nil), - ("baaabc", nil), ("baaaaabc", "aaaaa"), ("baaaaaaaabc", "aaaaa"), ("bb", nil)) @@ -478,11 +481,44 @@ extension RegexTests { "a{2,}+a", ("babc", nil), ("baabc", nil), + ("bb", nil)) + + // XFAIL'd versions of the above + firstMatchTests( + "a{2,4}+a", + ("baaabc", nil), + xfail: true) + firstMatchTests( + "a{,4}+a", + ("babc", nil), + ("baabc", nil), + ("baaabc", nil), + xfail: true) + firstMatchTests( + "a{2,}+a", ("baaabc", nil), ("baaaaabc", nil), ("baaaaaaaabc", nil), - ("bb", nil)) + xfail: true) + // XFAIL'd possessive tests + firstMatchTests( + "a?+a", + ("a", nil), + xfail: true) + firstMatchTests( + "(a|a)?+a", + ("a", nil), + xfail: true) + firstMatchTests( + "(a|a){2,4}+a", + ("a", nil), + ("aa", nil)) + firstMatchTests( + "(a|a){2,4}+a", + ("aaa", nil), + ("aaaa", nil), + xfail: true) firstMatchTests( "(?:a{2,4}?b)+", @@ -946,15 +982,19 @@ extension RegexTests { // TODO: Oniguruma \y and \Y firstMatchTests( - #"\u{65}"#, // Scalar 'e' is present in both: - ("Cafe\u{301}", "e"), // composed and - ("Sol Cafe", "e")) // 
standalone + #"\u{65}"#, // Scalar 'e' is present in both + ("Cafe\u{301}", nil), // but scalar mode requires boundary at end of match + xfail: true) + firstMatchTests( + #"\u{65}"#, // Scalar 'e' is present in both + ("Sol Cafe", "e")) // standalone is okay + firstMatchTests( #"\u{65}\y"#, // Grapheme boundary assertion ("Cafe\u{301}", nil), ("Sol Cafe", "e")) firstMatchTests( - #"\u{65}\Y"#, // Grapheme non-boundary assertion + #"(?u)\u{65}\Y"#, // Grapheme non-boundary assertion ("Cafe\u{301}", "e"), ("Sol Cafe", nil)) } @@ -1361,11 +1401,11 @@ extension RegexTests { // as a character. firstMatchTest(#"\u{65}\u{301}$"#, input: eDecomposed, match: eDecomposed) - // FIXME: Decomposed character in regex literal doesn't match an equivalent character - firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed, - xfail: true) + firstMatchTest(#"\u{65}\u{301}$"#, input: eComposed, match: eComposed) - firstMatchTest(#"\u{65}"#, input: eDecomposed, match: "e") + // FIXME: Implicit \y at end of match + firstMatchTest(#"\u{65}"#, input: eDecomposed, match: nil, + xfail: true) firstMatchTest(#"\u{65}$"#, input: eDecomposed, match: nil) // FIXME: \y is unsupported firstMatchTest(#"\u{65}\y"#, input: eDecomposed, match: nil, @@ -1389,12 +1429,10 @@ extension RegexTests { (eComposed, true), (eDecomposed, true)) - // FIXME: Decomposed character in regex literal doesn't match an equivalent character matchTest( #"e\u{301}$"#, (eComposed, true), - (eDecomposed, true), - xfail: true) + (eDecomposed, true)) matchTest( #"e$"#, @@ -1415,9 +1453,7 @@ extension RegexTests { (eDecomposed, true)) // \p{Letter} firstMatchTest(#"\p{Letter}$"#, input: eComposed, match: eComposed) - // FIXME: \p{Letter} doesn't match a decomposed character - firstMatchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed, - xfail: true) + firstMatchTest(#"\p{Letter}$"#, input: eDecomposed, match: eDecomposed) // \d firstMatchTest(#"\d"#, input: "5", match: "5") @@ -1480,7 +1516,8 @@ extension RegexTests { firstMatchTest(#"\u{1F1F0}\u{1F1F7}"#, input: flag, match: flag) // First Unicode scalar followed by CCC of regional indicators - firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag) + firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag, + xfail: true) // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character // A CCC of regional indicators x 2 @@ -1521,8 +1558,7 @@ extension RegexTests { // FIXME: \O is unsupported firstMatchTest(#"(?u)\O\u{301}"#, input: eDecomposed, match: eDecomposed) - firstMatchTest(#"(?u)e\O"#, input: eDecomposed, match: eDecomposed, - xfail: true) + firstMatchTest(#"(?u)e\O"#, input: eDecomposed, match: eDecomposed) firstMatchTest(#"\O"#, input: eComposed, match: eComposed) firstMatchTest(#"\O"#, input: eDecomposed, match: nil, xfail: true) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift new file mode 100644 index 000000000..71f459a1b --- /dev/null +++ b/Tests/RegexTests/UTS18Tests.swift @@ -0,0 +1,589 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2021-2022 Apple Inc. 
and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+//
+//===----------------------------------------------------------------------===//
+
+// This test suite includes tests that verify the behavior of `Regex` as it
+// relates to Unicode Technical Standard #18: Unicode Regular Expressions.
+//
+// Please note: Quotations of UTS18 in this file mostly use 'Character' to mean
+// Unicode code point, and 'String' to mean 'sequence of code points' — they
+// are not the Swift meanings of those terms.
+//
+// See https://unicode.org/reports/tr18/ for more.
+
+import XCTest
+@testable // for internal `matches(of:)`
+import _StringProcessing
+
+class UTS18Tests: XCTestCase {
+  var input: String {
+    "ABCdefghîøu\u{308}\u{FFF0} -–—[]123"
+    // 01234567890 1      234567890
+    //           0        10        20
+  }
+}
+
+fileprivate func regex(_ pattern: String) -> Regex<Substring> {
+  try! Regex(pattern, as: Substring.self)
+}
+
+fileprivate extension String {
+  subscript<R: RangeExpression>(pos bounds: R) -> Substring
+    where R.Bound == Int
+  {
+    let bounds = bounds.relative(to: 0..<count)
+    return dropFirst(bounds.lowerBound).prefix(bounds.count)
+  }
+}
+
+fileprivate func expectFirstMatch<Output: Equatable>(
+  _ input: String,
+  _ r: Regex<Output>,
+  _ output: Output,
+  file: StaticString = #file,
+  line: UInt = #line)
+{
+  XCTAssertEqual(input.firstMatch(of: r)?.output, output, file: file, line: line)
+}
+
+#if os(Linux)
+func XCTExpectFailure(_ message: String? = nil, body: () -> Void) {}
+#endif
+
+// MARK: - Basic Unicode Support: Level 1
+
+// C1. An implementation claiming conformance to Level 1 of this specification
+// shall meet the requirements described in the following sections:
+extension UTS18Tests {
+  // RL1.1 Hex Notation
+  //
+  // To meet this requirement, an implementation shall supply a mechanism for
+  // specifying any Unicode code point (from U+0000 to U+10FFFF), using the
+  // hexadecimal code point representation.
+  func testHexNotation() {
+    expectFirstMatch("ab", regex(#"\u{61}\u{62}"#), "ab")
+    expectFirstMatch("𝄞", regex(#"\u{1D11E}"#), "𝄞")
+  }
+
+  // 1.1.1 Hex Notation and Normalization
+  //
+  // TODO: Does this section make a recommendation?
+
+  // RL1.2 Properties
+  // To meet this requirement, an implementation shall provide at least a
+  // minimal list of properties, consisting of the following:
+  // - General_Category
+  // - Script and Script_Extensions
+  // - Alphabetic
+  // - Uppercase
+  // - Lowercase
+  // - White_Space
+  // - Noncharacter_Code_Point
+  // - Default_Ignorable_Code_Point
+  // - ANY, ASCII, ASSIGNED
+  // The values for these properties must follow the Unicode definitions, and
+  // include the property and property value aliases from the UCD. Matching of
+  // Binary, Enumerated, Catalog, and Name values must follow the Matching
+  // Rules from [UAX44] with one exception: implementations are not required
+  // to ignore an initial prefix string of "is" in property values.
+ func testProperties() { + // General_Category + expectFirstMatch(input, regex(#"\p{Lu}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{lu}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{uppercase letter}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{Uppercase Letter}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{Uppercase_Letter}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{uppercaseletter}+"#), input[pos: ..<3]) + + expectFirstMatch(input, regex(#"\p{P}+"#), "-–—[]") + expectFirstMatch(input, regex(#"\p{Pd}+"#), "-–—") + + expectFirstMatch(input, regex(#"\p{Any}+"#), input[...]) + expectFirstMatch(input, regex(#"\p{Assigned}+"#), input[pos: ..<11]) + expectFirstMatch(input, regex(#"\p{ASCII}+"#), input[pos: ..<8]) + + // Script and Script_Extensions + // U+3042 あ HIRAGANA LETTER A Hira {Hira} + XCTAssertTrue("\u{3042}".contains(regex(#"\p{Hira}"#))) + XCTAssertTrue("\u{3042}".contains(regex(#"\p{sc=Hira}"#))) + XCTAssertTrue("\u{3042}".contains(regex(#"\p{scx=Hira}"#))) + // U+30FC ー KATAKANA-HIRAGANA PROLONGED SOUND MARK Zyyy = Common {Hira, Kana} + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{Hira}"#))) // Implicit = Script_Extensions + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{Kana}"#))) + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{sc=Zyyy}"#))) // Explicit = Script + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{scx=Hira}"#))) + XCTAssertTrue("\u{30FC}".contains(regex(#"\p{scx=Kana}"#))) + XCTAssertFalse("\u{30FC}".contains(regex(#"\p{sc=Hira}"#))) + XCTAssertFalse("\u{30FC}".contains(regex(#"\p{sc=Kana}"#))) + + // Uppercase, etc + expectFirstMatch(input, regex(#"\p{Uppercase}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{isUppercase}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{Uppercase=true}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{is Uppercase}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{is uppercase = true}+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"\p{lowercase}+"#), input[pos: 3..<11]) + expectFirstMatch(input, regex(#"\p{whitespace}+"#), input[pos: 12..<13]) + + // Block vs Writing System + let greekScalar = "Θ" // U+0398 + let greekExtendedScalar = "ἀ" // U+1F00 + XCTAssertTrue(greekScalar.contains(regex(#"\p{Greek}"#))) + XCTAssertTrue(greekExtendedScalar.contains(regex(#"\p{Greek}"#))) + } + + func testProperties_XFail() { + XCTExpectFailure("Need to support 'age' and 'block' properties") { + // XCTAssertFalse("z".contains(#/\p{age=3.1}/#)) + XCTFail(#"\(#/\p{age=3.1}/#)"#) + // XCTAssertTrue("\u{1F00}".contains(#/\p{Block=Greek}/#)) + XCTFail(#"\(#/\p{Block=Greek}/#)"#) + } + } + + // RL1.2a Compatibility Properties + // To meet this requirement, an implementation shall provide the properties + // listed in Annex C: Compatibility Properties, with the property values as + // listed there. Such an implementation shall document whether it is using + // the Standard Recommendation or POSIX-compatible properties. 
+ func testCompatibilityProperties() throws { + // FIXME: These tests seem insufficient + expectFirstMatch(input, regex(#"[[:alpha:]]+"#), input[pos: ..<11]) + expectFirstMatch(input, regex(#"[[:upper:]]+"#), input[pos: ..<3]) + expectFirstMatch(input, regex(#"[[:lower:]]+"#), input[pos: 3..<11]) + expectFirstMatch(input, regex(#"[[:punct:]]+"#), input[pos: 13..<18]) + expectFirstMatch(input, regex(#"[[:digit:]]+"#), input[pos: 18..<21]) + expectFirstMatch(input, regex(#"[[:xdigit:]]+"#), input[pos: ..<6]) + expectFirstMatch(input, regex(#"[[:alnum:]]+"#), input[pos: ..<11]) + expectFirstMatch(input, regex(#"[[:space:]]+"#), input[pos: 12..<13]) + // TODO: blank + // TODO: cntrl + expectFirstMatch(input, regex(#"[[:graph:]]+"#), input[pos: ..<11]) + expectFirstMatch(input, regex(#"[[:print:]]+"#), input[...]) + expectFirstMatch(input, regex(#"[[:word:]]+"#), input[pos: ..<11]) + } + + //RL1.3 Subtraction and Intersection + // + // To meet this requirement, an implementation shall supply mechanisms for + // union, intersection and set-difference of sets of characters within + // regular expression character class expressions. + func testSubtractionAndIntersection() throws { + // Non-ASCII letters + expectFirstMatch(input, regex(#"[\p{Letter}--\p{ASCII}]+"#), input[pos: 8..<11]) + // Digits that aren't 1 or 2 + expectFirstMatch(input, regex(#"[\p{digit}--[12]]+"#), input[pos: 20..<21]) + + // ASCII-only letters + expectFirstMatch(input, regex(#"[\p{Letter}&&\p{ASCII}]+"#), input[pos: ..<8]) + // Digits that are 2 or 3 + expectFirstMatch(input, regex(#"[\p{digit}&&[23]]+"#), input[pos: 19..<21]) + + // Non-ASCII lowercase + non-lowercase ASCII + expectFirstMatch(input, regex(#"[\p{lowercase}~~\p{ascii}]+"#), input[pos: ..<3]) + XCTAssertTrue("123%&^ABC".contains(regex(#"^[\p{lowercase}~~\p{ascii}]+$"#))) + } + + func testSubtractionAndIntersectionPrecedence() { + expectFirstMatch("ABC123-", regex(#"[[:alnum:]]*-"#), "ABC123-") + expectFirstMatch("ABC123-", regex(#"[[:alnum:]--\p{Uppercase}]*-"#), "123-") + // Union binds more closely than difference + expectFirstMatch("ABC123-", regex(#"[[:alnum:]--\p{Uppercase}[:digit:]]*-"#), "-") + // TODO: Test for intersection precedence + } + + // RL1.4 Simple Word Boundaries + // To meet this requirement, an implementation shall extend the word boundary + // mechanism so that: + // - The class of includes all the Alphabetic values from the + // Unicode character database, from UnicodeData.txt, plus the decimals + // (General_Category=Decimal_Number, or equivalently Numeric_Type=Decimal), + // and the U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER + // (Join_Control=True). See also Annex C: Compatibility Properties. + // - Nonspacing marks are never divided from their base characters, and + // otherwise ignored in locating boundaries. + func testSimpleWordBoundaries() { + let simpleWordRegex = regex(#".+?\b"#).wordBoundaryKind(.unicodeLevel1) + expectFirstMatch(input, simpleWordRegex, input[pos: ..<11]) + expectFirstMatch("don't", simpleWordRegex, "don") + expectFirstMatch("Cafe\u{301}", simpleWordRegex, "Café") + } + + // RL1.5 Simple Loose Matches + // + // To meet this requirement, if an implementation provides for case- + // insensitive matching, then it shall provide at least the simple, default + // Unicode case-insensitive matching, and specify which properties are closed + // and which are not. 
+ // + // To meet this requirement, if an implementation provides for case + // conversions, then it shall provide at least the simple, default Unicode + // case folding. + func testSimpleLooseMatches() { + expectFirstMatch("Dåb", regex(#"Dåb"#).ignoresCase(), "Dåb") + expectFirstMatch("dÅB", regex(#"Dåb"#).ignoresCase(), "dÅB") + expectFirstMatch("D\u{212B}B", regex(#"Dåb"#).ignoresCase(), "D\u{212B}B") + } + + func testSimpleLooseMatches_XFail() { + XCTExpectFailure("Need case folding support") { + let sigmas = "σΣς" + expectFirstMatch(sigmas, regex(#"σ+"#).ignoresCase(), sigmas[...]) + expectFirstMatch(sigmas, regex(#"Σ+"#).ignoresCase(), sigmas[...]) + expectFirstMatch(sigmas, regex(#"ς+"#).ignoresCase(), sigmas[...]) + + // TODO: Test German sharp S + // TODO: Test char classes, e.g. [\p{Block=Phonetic_Extensions} [A-E]] + } + } + + // RL1.6 Line Boundaries + // + // To meet this requirement, if an implementation provides for line-boundary + // testing, it shall recognize not only CRLF, LF, CR, but also NEL (U+0085), + // PARAGRAPH SEPARATOR (U+2029) and LINE SEPARATOR (U+2028). + func testLineBoundaries() { + let lineInput = """ + 01 + 02\r\ + 03\n\ + 04\u{a}\ + 05\u{b}\ + 06\u{c}\ + 07\u{d}\ + 08\u{d}\u{a}\ + 09\u{85}\ + 10\u{2028}\ + 11\u{2029}\ + + """ + // Check the input counts + var lines = lineInput.matches(of: regex(#"\d{2}"#)) + XCTAssertEqual(lines.count, 11) + // Test \R - newline sequence + lines = lineInput.matches(of: regex(#"\d{2}\R"#)) + XCTAssertEqual(lines.count, 11) + // Test anchors as line boundaries + lines = lineInput.matches(of: regex(#"^\d{2}$"#).anchorsMatchLineEndings()) + XCTAssertEqual(lines.count, 11) + // Test that dot does not match line endings + lines = lineInput.matches(of: regex(#".+"#)) + XCTAssertEqual(lines.count, 11) + + // Does not contain an empty line + XCTAssertFalse(lineInput.contains(regex(#"^$"#))) + // Does contain an empty line (between \n and \r, which are reversed here) + let empty = "\n\r" + XCTAssertTrue(empty.contains(regex(#"^$"#).anchorsMatchLineEndings())) + } + + // RL1.7 Supplementary Code Points + // + // To meet this requirement, an implementation shall handle the full range of + // Unicode code points, including values from U+FFFF to U+10FFFF. In + // particular, where UTF-16 is used, a sequence consisting of a leading + // surrogate followed by a trailing surrogate shall be handled as a single + // code point in matching. + func testSupplementaryCodePoints() { + XCTAssertTrue("👍".contains(regex(#"\u{1F44D}"#))) + XCTAssertTrue("👍".contains(regex(#"[\u{1F440}-\u{1F44F}]"#))) + XCTAssertTrue("👍👎".contains(regex(#"^[\u{1F440}-\u{1F44F}]+$"#))) + } +} + +// MARK: - Extended Unicode Support: Level 2 + +// C2. An implementation claiming conformance to Level 2 of this specification +// shall satisfy C1, and meet the requirements described in the following +// sections: +extension UTS18Tests { + // RL2.1 Canonical Equivalents + // + // Specific recommendation? 
+ func testCanonicalEquivalents() { + let equivalents = [ + "\u{006f}\u{031b}\u{0323}", // o + horn + dot_below + "\u{006f}\u{0323}\u{031b}", // o + dot_below + horn + "\u{01a1}\u{0323}", // o-horn + dot_below + "\u{1ecd}\u{031b}", // o-dot_below + horn + "\u{1ee3}", // o-horn-dot_below + ] + + let regexes = [ + regex(#"\u{006f}\u{031b}\u{0323}"#), // o + horn + dot_below + regex(#"\u{006f}\u{0323}\u{031b}"#), // o + dot_below + horn + regex(#"\u{01a1}\u{0323}"#), // o-horn + dot_below + regex(#"\u{1ecd}\u{031b}"#), // o-dot_below + horn + regex(#"\u{1ee3}"#), // o-horn-dot_below + ] + + // Default: Grapheme cluster semantics + for (regexNum, regex) in regexes.enumerated() { + for (equivNum, equiv) in equivalents.enumerated() { + XCTAssertTrue( + equiv.contains(regex), + "Grapheme cluster semantics: Regex \(regexNum) didn't match with string \(equivNum)") + } + } + + // Unicode scalar semantics + for (regexNum, regex) in regexes.enumerated() { + for (equivNum, equiv) in equivalents.enumerated() { + let regex = regex.matchingSemantics(.unicodeScalar) + if regexNum == equivNum { + XCTAssertTrue( + equiv.contains(regex), + "Unicode scalar semantics: Regex \(regexNum) didn't match with string \(equivNum)") + } else { + XCTAssertFalse( + equiv.contains(regex), + "Unicode scalar semantics: Regex \(regexNum) incorrectly matched with string \(equivNum)") + } + } + } + } + + // RL2.2 Extended Grapheme Clusters and Character Classes with Strings + // + // To meet this requirement, an implementation shall provide a mechanism for + // matching against an arbitrary extended grapheme cluster, Character Classes + // with Strings, and extended grapheme cluster boundaries. + func testExtendedGraphemeClusters() { + XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef.$"#))) + XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#))) + XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#).matchingSemantics(.unicodeScalar))) + XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef.+\y"#).matchingSemantics(.unicodeScalar))) + } + + func testCharacterClassesWithStrings() { + let regex = regex(#"[a-z🧐🇧🇪🇧🇫🇧🇬]"#) + XCTAssertTrue("🧐".contains(regex)) + XCTAssertTrue("🇧🇫".contains(regex)) + } + + // RL2.3 Default Word Boundaries + // + // To meet this requirement, an implementation shall provide a mechanism for + // matching Unicode default word boundaries. + func testDefaultWordBoundaries() { + XCTExpectFailure { XCTFail("Implement tests") } + } + + // RL2.4 Default Case Conversion + // + // To meet this requirement, if an implementation provides for case + // conversions, then it shall provide at least the full, default Unicode case + // folding. + func testDefaultCaseConversion() { + XCTExpectFailure { XCTFail("Implement tests") } + } + + // RL2.5 Name Properties + // + // To meet this requirement, an implementation shall support individually + // named characters. 
+ func testNameProperty_XFail() { + XCTExpectFailure("Need \\p{name=...} support") { + XCTFail(#"\(#/\p{name=BOM}/#)"#) + // Name property + // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=ZERO WIDTH NO-BREAK SPACE}/#)) + // Name property and Matching Rules + // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=zerowidthno breakspace}/#)) + // Name_Alias property + // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=BYTE ORDER MARK}/#)) + // Name_Alias property (again) + // XCTAssertTrue("\u{FEFF}".contains(#/\p{name=BOM}/#)) + + // Computed name + // XCTAssertTrue("강".contains(#/\p{name=HANGUL SYLLABLE GANG}/#)) + + // Control character + // XCTAssertTrue("\u{7}".contains(#/\p{name=BEL}/#)) + // Graphic symbol + // XCTAssertTrue("\u{1F514}".contains(#/\p{name=BELL}/#)) + } + } + + func testIndividuallyNamedCharacters() { + XCTAssertTrue("\u{263A}".contains(regex(#"\N{WHITE SMILING FACE}"#))) + XCTAssertTrue("\u{3B1}".contains(regex(#"\N{GREEK SMALL LETTER ALPHA}"#))) + XCTAssertTrue("\u{10450}".contains(regex(#"\N{SHAVIAN LETTER PEEP}"#))) + + XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{ZERO WIDTH NO-BREAK SPACE}"#))) + XCTAssertTrue("강".contains(regex(#"\N{HANGUL SYLLABLE GANG}"#))) + XCTAssertTrue("\u{1F514}".contains(regex(#"\N{BELL}"#))) + XCTAssertTrue("🐯".contains(regex(#"\N{TIGER FACE}"#))) + XCTAssertFalse("🐯".contains(regex(#"\N{TIEGR FACE}"#))) + + // Loose matching + XCTAssertTrue("\u{263A}".contains(regex(#"\N{whitesmilingface}"#))) + XCTAssertTrue("\u{263A}".contains(regex(#"\N{wHiTe_sMiLiNg_fAcE}"#))) + XCTAssertTrue("\u{263A}".contains(regex(#"\N{White Smiling-Face}"#))) + XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{zerowidthno breakspace}"#))) + + // Matching semantic level + XCTAssertFalse("👩‍👩‍👧‍👦".contains(regex(#".\N{ZERO WIDTH JOINER}"#))) + XCTAssertTrue("👩‍👩‍👧‍👦".contains(regex(#"(?u).\N{ZERO WIDTH JOINER}"#))) + } + + func testIndividuallyNamedCharacters_XFail() { + XCTExpectFailure("Need to support named chars in custom character classes") { + XCTFail("\(regex(#"[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+"#))") + // XCTAssertTrue("^\u{3B1}\u{3B2}$".contains(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#)) + } + + XCTExpectFailure("Other named char failures -- investigate") { + XCTAssertTrue("\u{C}".contains(regex(#"\N{FORM FEED}"#))) + XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BYTE ORDER MARK}"#))) + XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BOM}"#))) + XCTAssertTrue("\u{7}".contains(regex(#"\N{BEL}"#))) + } + + XCTExpectFailure("Need to recognize invalid names at compile time") { + XCTFail("This should be a compilation error, not a match failure:") + XCTAssertFalse("abc".contains(regex(#"\N{NOT AN ACTUAL CHARACTER NAME}"#))) + } + } + + // RL2.6 Wildcards in Property Values + // + // To meet this requirement, an implementation shall support wildcards in + // Unicode property values. + func testWildcardsInPropertyValues() { + XCTExpectFailure { XCTFail("Implement tests") } + } + + // RL2.7 Full Properties + // + // To meet this requirement, an implementation shall support all of the + // properties listed below that are in the supported version of the Unicode + // Standard (or Unicode Technical Standard, respectively), with values that + // match the Unicode definitions for that version. 
+ func testFullProperties() { + // MARK: General + // Name (Name_Alias) + // Block + // Age + // General_Category + // Script (Script_Extensions) + // White_Space + // Alphabetic + // Hangul_Syllable_Type + // Noncharacter_Code_Point + // Default_Ignorable_Code_Point + // Deprecated + // Logical_Order_Exception + // Variation_Selector + + // MARK: Numeric + // Numeric_Value + // Numeric_Type + // Hex_Digit + // ASCII_Hex_Digit + + // MARK: Identifiers + // ID_Continue + // ID_Start + // XID_Continue + // XID_Start + // Pattern_Syntax + // Pattern_White_Space + // Identifier_Status + // Identifier_Type + + // MARK: CJK + // Ideographic + // Unified_Ideograph + // Radical + // IDS_Binary_Operator + // IDS_Trinary_Operator + // Equivalent_Unified_Ideograph + XCTExpectFailure { + XCTFail(#"Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)"#) + // XCTAssertTrue("⼚⺁厂".contains(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)) + } + + // MARK: Case + // Uppercase + // Lowercase + // Simple_Lowercase_Mapping + // Simple_Titlecase_Mapping + // Simple_Uppercase_Mapping + // Simple_Case_Folding + // Soft_Dotted + // Cased + // Case_Ignorable + // Changes_When_Lowercased + // Changes_When_Uppercased + XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased}"#))) + XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased=true}"#))) + XCTAssertFalse("A".contains(regex(#"\p{Changes_When_Uppercased}"#))) + // Changes_When_Titlecased + // Changes_When_Casefolded + // Changes_When_Casemapped + + // MARK: Normalization + // Canonical_Combining_Class + // Decomposition_Type + // NFC_Quick_Check + // NFKC_Quick_Check + // NFD_Quick_Check + // NFKD_Quick_Check + // NFKC_Casefold + // Changes_When_NFKC_Casefolded + + // MARK: Emoji + // Emoji + // Emoji_Presentation + // Emoji_Modifier + // Emoji_Modifier_Base + // Emoji_Component + // Extended_Pictographic + // Basic_Emoji* + // Emoji_Keycap_Sequence* + // RGI_Emoji_Modifier_Sequence* + // RGI_Emoji_Flag_Sequence* + // RGI_Emoji_Tag_Sequence* + // RGI_Emoji_ZWJ_Sequence* + // RGI_Emoji* + + // MARK: Shaping and Rendering + // Join_Control + // Joining_Group + // Joining_Type + // Vertical_Orientation + // Line_Break + // Grapheme_Cluster_Break + // Sentence_Break + // Word_Break + // East_Asian_Width + // Prepended_Concatenation_Mark + + // MARK: Bidirectional + // Bidi_Class + // Bidi_Control + // Bidi_Mirrored + // Bidi_Mirroring_Glyph + // Bidi_Paired_Bracket + // Bidi_Paired_Bracket_Type + + // MARK: Miscellaneous + // Math + // Quotation_Mark + // Dash + // Sentence_Terminal + // Terminal_Punctuation + // Diacritic + // Extender + // Grapheme_Base + // Grapheme_Extend + // Regional_Indicator + } +}