From 4421822b457a847e75375ce5c427c5a9f6507f09 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Sat, 7 May 2022 17:44:32 -0500 Subject: [PATCH 01/10] UTS#18 test coverage for more properties --- Tests/RegexTests/UTS18Tests.swift | 36 ++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index eff9f9b4e..2d01beaf9 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -70,6 +70,9 @@ extension UTS18Tests { func testHexNotation() { expectFirstMatch("ab", regex(#"\u{61}\u{62}"#), "ab") expectFirstMatch("𝄞", regex(#"\u{1D11E}"#), "𝄞") + expectFirstMatch("\n", regex(#"\u{0A}"#), "\n") + expectFirstMatch("\r", regex(#"\u{0D}"#), "\r") + expectFirstMatch("\r\n", regex(#"\u{0D}\u{0A}"#), "\r\n") } // 1.1.1 Hex Notation and Normalization @@ -188,7 +191,7 @@ extension UTS18Tests { // Non-ASCII lowercase + non-lowercase ASCII expectFirstMatch(input, regex(#"[\p{lowercase}~~\p{ascii}]+"#), input[pos: ..<3]) - XCTAssertTrue("123%&^ABC".contains(regex(#"^[\p{lowercase}~~\p{ascii}]+$"#))) + XCTAssertTrue("123%&^ABCDéîøü".contains(regex(#"^[\p{lowercase}~~\p{ascii}]+$"#))) } func testSubtractionAndIntersectionPrecedence() { @@ -478,10 +481,20 @@ extension UTS18Tests { func testFullProperties() { // MARK: General // Name (Name_Alias) + XCTAssertTrue("a".contains(regex(#"\p{name=latin small letter a}"#))) + // Block + XCTExpectFailure { + XCTFail(#"Unsupported: \(#/^\p{block=Block Elements}+$/#)"#) + // XCTAssertTrue("▂▃▄▅▆▇".contains(regex(#"^\p{block=Block Elements}+$"#))) + } + // Age // General_Category // Script (Script_Extensions) + XCTAssertTrue("a".contains(regex(#"\p{script=latin}"#))) + XCTAssertTrue("강".contains(regex(#"\p{script=hangul}"#))) + // White_Space // Alphabetic // Hangul_Syllable_Type @@ -528,15 +541,36 @@ extension UTS18Tests { // Simple_Case_Folding // Soft_Dotted // Cased + XCTAssertTrue("A".contains(regex(#"\p{Cased}"#))) + XCTAssertTrue("A".contains(regex(#"\p{Is_Cased}"#))) + XCTAssertFalse("0".contains(regex(#"\p{Cased}"#))) + // Case_Ignorable + XCTAssertTrue(":".contains(regex(#"\p{Case_Ignorable}"#))) + XCTAssertFalse("a".contains(regex(#"\p{Case_Ignorable}"#))) + // Changes_When_Lowercased + XCTAssertTrue("A".contains(regex(#"\p{Changes_When_Lowercased}"#))) + XCTAssertTrue("A".contains(regex(#"\p{Changes_When_Lowercased=true}"#))) + XCTAssertFalse("a".contains(regex(#"\p{Changes_When_Lowercased}"#))) + // Changes_When_Uppercased XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased}"#))) XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Uppercased=true}"#))) XCTAssertFalse("A".contains(regex(#"\p{Changes_When_Uppercased}"#))) + // Changes_When_Titlecased + XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Titlecased=true}"#))) + XCTAssertFalse("A".contains(regex(#"\p{Changes_When_Titlecased}"#))) + // Changes_When_Casefolded + XCTAssertTrue("A".contains(regex(#"\p{Changes_When_Casefolded=true}"#))) + XCTAssertFalse("a".contains(regex(#"\p{Changes_When_Casefolded}"#))) + XCTAssertFalse(":".contains(regex(#"\p{Changes_When_Casefolded}"#))) + // Changes_When_Casemapped + XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Casemapped}"#))) + XCTAssertFalse(":".contains(regex(#"\p{Changes_When_Casemapped}"#))) // MARK: Normalization // Canonical_Combining_Class From 90710014607f5bc430c93d897e181ec3b11f3041 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Sat, 7 May 2022 17:52:23 -0500 Subject: [PATCH 02/10] Add support for `\p{age=...}` --- Sources/_RegexParser/Regex/AST/Atom.swift | 3 +++ .../CharacterPropertyClassification.swift | 25 +++++++++++++++++++ .../_StringProcessing/ConsumerInterface.swift | 11 +++++--- Tests/RegexTests/UTS18Tests.swift | 16 +++++++++--- 4 files changed, 48 insertions(+), 7 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index d6062115a..f31a57fc9 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -399,6 +399,9 @@ extension AST.Atom.CharacterProperty { /// Character name in the form `\p{name=...}` case named(String) + /// Character age, as per UnicodeScalar.Properties.age. + case age(major: Int, minor: Int) + case posix(Unicode.POSIXProperty) /// Some special properties implemented by PCRE and Oniguruma. diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index ee9195ff3..f82666282 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -361,6 +361,27 @@ extension Source { } } } + + static func parseAge(_ value: String) -> Unicode.Version? { + // Age can be specified in the form '3.0' or 'V3_0'. + // Other formats are not supported. + var str = value[...] + + let separator: Character + if str.first == "V" { + str.removeFirst() + separator = "_" + } else { + separator = "." + } + + guard let sepIndex = str.firstIndex(of: separator), + let major = Int(str[.. MEProgram.ConsumeFunction { - let consume = opts.semanticLevel == .graphemeCluster - ? consumeCharacterWithSingleScalar - : consumeScalar - + let consume = consumeFunction(for: opts) return consume(propertyScalarPredicate { // FIXME: name aliases not covered by $0.nameAlias are missed // e.g. U+FEFF has both 'BYTE ORDER MARK' and 'BOM' as aliases @@ -491,6 +488,12 @@ extension AST.Atom.CharacterProperty { case .named(let n): return consumeName(n, opts: opts) + case .age(let major, let minor): + return consume { + guard let age = $0.properties.age else { return false } + return age <= (major, minor) + } + case .posix(let p): return p.generateConsumer(opts) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 2d01beaf9..69b58c24e 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -143,9 +143,7 @@ extension UTS18Tests { } func testProperties_XFail() { - XCTExpectFailure("Need to support 'age' and 'block' properties") { - // XCTAssertFalse("z".contains(#/\p{age=3.1}/#)) - XCTFail(#"\(#/\p{age=3.1}/#)"#) + XCTExpectFailure("Need to support 'block' properties") { // XCTAssertTrue("\u{1F00}".contains(#/\p{Block=Greek}/#)) XCTFail(#"\(#/\p{Block=Greek}/#)"#) } @@ -490,6 +488,18 @@ extension UTS18Tests { } // Age + XCTAssertTrue("a".contains(regex(#"\p{age=1.1}"#))) + XCTAssertTrue("a".contains(regex(#"\p{age=V1_1}"#))) + XCTAssertTrue("a".contains(regex(#"\p{age=14.0}"#))) + XCTAssertTrue("a".contains(regex(#"\p{age=V99_99}"#))) + + XCTAssertTrue("🥱".contains(regex(#"\p{age=12.0}"#))) + XCTAssertFalse("🥱".contains(regex(#"\p{age=11.0}"#))) + + XCTAssertTrue("⌁".contains(regex(#"\p{age=3.0}"#))) + XCTAssertFalse("⌁".contains(regex(#"\p{age=2.0}"#))) + XCTAssertTrue("⌁".contains(regex(#"[\p{age=3.0}--\p{age=2.0}]"#))) + // General_Category // Script (Script_Extensions) XCTAssertTrue("a".contains(regex(#"\p{script=latin}"#))) From f6e6d414e35a8882685abc92c6d1f5e609722064 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Sat, 7 May 2022 18:16:25 -0500 Subject: [PATCH 03/10] Improve diagnostics for unicode properties --- .../CharacterPropertyClassification.swift | 30 +++++++++++-------- .../Regex/Parse/Diagnostics.swift | 9 ++++++ Sources/_StringProcessing/ByteCodeGen.swift | 1 - Tests/RegexTests/ParseTests.swift | 6 ++++ 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index f82666282..1c54f800c 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -13,17 +13,17 @@ extension Source { typealias PropertyKind = AST.Atom.CharacterProperty.Kind static private func withNormalizedForms( - _ str: String, match: (String) -> T? - ) -> T? { + _ str: String, match: (String) throws -> T? + ) rethrows -> T? { // This follows the rules provided by UAX44-LM3, including trying to drop an // "is" prefix, which isn't required by UTS#18 RL1.2, but is nice for // consistency with other engines and the Unicode.Scalar.Properties names. let str = str.filter { !$0.isWhitespace && $0 != "_" && $0 != "-" } .lowercased() - if let m = match(str) { + if let m = try match(str) { return m } - if str.hasPrefix("is"), let m = match(String(str.dropFirst(2))) { + if str.hasPrefix("is"), let m = try match(String(str.dropFirst(2))) { return m } return nil @@ -435,24 +435,28 @@ extension Source { // This uses the aliases defined in // https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt. - let match = withNormalizedForms(key) { key -> PropertyKind? in + let match = try withNormalizedForms(key) { key -> PropertyKind? in switch key { case "script", "sc": - if let script = classifyScriptProperty(value) { - return .script(script) + guard let script = classifyScriptProperty(value) else { + throw ParseError.unrecognizedScript(value) } + return .script(script) case "scriptextensions", "scx": - if let script = classifyScriptProperty(value) { - return .scriptExtension(script) + guard let script = classifyScriptProperty(value) else { + throw ParseError.unrecognizedScript(value) } + return .scriptExtension(script) case "gc", "generalcategory": - if let cat = classifyGeneralCategory(value) { - return .generalCategory(cat) + guard let cat = classifyGeneralCategory(value) else { + throw ParseError.unrecognizedCategory(value) } + return .generalCategory(cat) case "age": - if let (major, minor) = parseAge(value) { - return .age(major: major, minor: minor) + guard let (major, minor) = parseAge(value) else { + throw ParseError.invalidAge(value) } + return .age(major: major, minor: minor) case "name", "na": return .named(value) default: diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index c3d74c30b..e913a5b6f 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -59,6 +59,9 @@ enum ParseError: Error, Hashable { case emptyProperty case unknownProperty(key: String?, value: String) + case unrecognizedScript(String) + case unrecognizedCategory(String) + case invalidAge(String) case expectedGroupSpecifier case unbalancedEndOfGroup @@ -167,6 +170,12 @@ extension ParseError: CustomStringConvertible { return "extended syntax may not be disabled in multi-line mode" case .expectedCalloutArgument: return "expected argument to callout" + case .unrecognizedScript(let value): + return "unrecognized script '\(value)'" + case .unrecognizedCategory(let value): + return "unrecognized category '\(value)'" + case .invalidAge(let value): + return "invalid age format for '\(value)'. Use '3.0' or 'V3_0' formats." } } } diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index 2131d1eb5..a1e9f926d 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -170,7 +170,6 @@ extension Compiler.ByteCodeGen { mutating func emitCharacter(_ c: Character) throws { // Unicode scalar matches the specific scalars that comprise a character if options.semanticLevel == .unicodeScalar { - print("emitting '\(c)' as a sequence of \(c.unicodeScalars.count) scalars") for scalar in c.unicodeScalars { try emitScalar(scalar) } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 0ef021442..a8ead7ade 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2257,6 +2257,12 @@ extension RegexTests { diagnosticTest(#"\p{a=b"#, .unknownProperty(key: "a", value: "b")) diagnosticTest(#"\p{aaa[b]}"#, .unknownProperty(key: nil, value: "aaa")) diagnosticTest(#"\p{a=b=c}"#, .unknownProperty(key: "a", value: "b")) + diagnosticTest(#"\p{script=Not_A_Script}"#, .unrecognizedScript("Not_A_Script")) + diagnosticTest(#"\p{scx=Not_A_Script}"#, .unrecognizedScript("Not_A_Script")) + diagnosticTest(#"\p{gc=Not_A_Category}"#, .unrecognizedCategory("Not_A_Category")) + diagnosticTest(#"\p{age=3}"#, .invalidAge("3")) + diagnosticTest(#"\p{age=V3}"#, .invalidAge("V3")) + diagnosticTest(#"\p{age=3.0.1}"#, .invalidAge("3.0.1")) diagnosticTest(#"(?#"#, .expected(")")) diagnosticTest(#"(?x"#, .expected(")")) From a41a3b7beaebabf2681da5c38f25734b014897a7 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Sun, 8 May 2022 21:59:44 -0500 Subject: [PATCH 04/10] More test additions No code changes needed --- Tests/RegexTests/UTS18Tests.swift | 48 +++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 69b58c24e..d45e09c88 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -164,11 +164,16 @@ extension UTS18Tests { expectFirstMatch(input, regex(#"[[:xdigit:]]+"#), input[pos: ..<6]) expectFirstMatch(input, regex(#"[[:alnum:]]+"#), input[pos: ..<11]) expectFirstMatch(input, regex(#"[[:space:]]+"#), input[pos: 12..<13]) - // TODO: blank - // TODO: cntrl expectFirstMatch(input, regex(#"[[:graph:]]+"#), input[pos: ..<11]) expectFirstMatch(input, regex(#"[[:print:]]+"#), input[...]) expectFirstMatch(input, regex(#"[[:word:]]+"#), input[pos: ..<11]) + + let blankAndControl = """ + \t\u{01}\u{19} + """ + // \t - tab is in both [:blank:] and [:cntrl:] + expectFirstMatch(blankAndControl, regex(#"[[:blank:]]+"#), blankAndControl[pos: ..<2]) + expectFirstMatch(blankAndControl, regex(#"[[:cntrl:]]+"#), blankAndControl[pos: 1...]) } //RL1.3 Subtraction and Intersection @@ -361,12 +366,15 @@ extension UTS18Tests { XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#))) XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef\X$"#).matchingSemantics(.unicodeScalar))) XCTAssertTrue("abcdef🇬🇭".contains(regex(#"abcdef.+\y"#).matchingSemantics(.unicodeScalar))) + XCTAssertFalse("abcdef🇬🇭".contains(regex(#"abcdef.$"#).matchingSemantics(.unicodeScalar))) } func testCharacterClassesWithStrings() { let regex = regex(#"[a-z🧐🇧🇪🇧🇫🇧🇬]"#) XCTAssertTrue("🧐".contains(regex)) XCTAssertTrue("🇧🇫".contains(regex)) + XCTAssertTrue("🧐".contains(regex.matchingSemantics(.unicodeScalar))) + XCTAssertTrue("🇧🇫".contains(regex.matchingSemantics(.unicodeScalar))) } // RL2.3 Default Word Boundaries @@ -449,7 +457,7 @@ extension UTS18Tests { // XCTAssertTrue("^\u{3B1}\u{3B2}$".contains(#/[\N{GREEK SMALL LETTER ALPHA}-\N{GREEK SMALL LETTER BETA}]+/#)) } - XCTExpectFailure("Other named char failures -- investigate") { + XCTExpectFailure("Other named char failures -- name aliases") { XCTAssertTrue("\u{C}".contains(regex(#"\N{FORM FEED}"#))) XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BYTE ORDER MARK}"#))) XCTAssertTrue("\u{FEFF}".contains(regex(#"\N{BOM}"#))) @@ -501,24 +509,58 @@ extension UTS18Tests { XCTAssertTrue("⌁".contains(regex(#"[\p{age=3.0}--\p{age=2.0}]"#))) // General_Category + XCTAssertTrue("a".contains(regex(#"\p{Ll}"#))) + XCTAssertTrue("a".contains(regex(#"\p{gc=Ll}"#))) + XCTAssertTrue("a".contains(regex(#"\p{gc=Ll}"#))) + XCTAssertFalse("A".contains(regex(#"\p{gc=Ll}"#))) + XCTAssertTrue("A".contains(regex(#"\p{gc=L}"#))) + + XCTAssertTrue("a".contains(regex(#"\p{Any}"#))) + XCTAssertTrue("a".contains(regex(#"\p{Assigned}"#))) + XCTAssertTrue("a".contains(regex(#"\p{ASCII}"#))) + // Script (Script_Extensions) XCTAssertTrue("a".contains(regex(#"\p{script=latin}"#))) XCTAssertTrue("강".contains(regex(#"\p{script=hangul}"#))) // White_Space + XCTAssertTrue(" ".contains(regex(#"\p{whitespace}"#))) + XCTAssertTrue("\n".contains(regex(#"\p{White_Space}"#))) + XCTAssertFalse("a".contains(regex(#"\p{whitespace}"#))) + // Alphabetic + XCTAssertTrue("aéîøüƒ".contains(regex(#"^\p{Alphabetic}+$"#))) + // Hangul_Syllable_Type + XCTExpectFailure { + XCTFail(#"Unsupported: \(#/\p{Hangul_Syllable_Type=L}/#)"#) + // XCTAssertTrue("ㄱ".contains(regex(#"\p{Hangul_Syllable_Type=L}"#))) + } + // Noncharacter_Code_Point + XCTAssertTrue("\u{10FFFF}".contains(regex(#"\p{Noncharacter_Code_Point}"#))) + // Default_Ignorable_Code_Point + XCTAssertTrue("\u{00AD}".contains(regex(#"\p{Default_Ignorable_Code_Point}"#))) + // Deprecated + XCTAssertTrue("ʼn".contains(regex(#"\p{Deprecated}"#))) // Logical_Order_Exception + XCTAssertTrue("ແ".contains(regex(#"\p{Logical_Order_Exception}"#))) // Variation_Selector + XCTAssertTrue("\u{FE07}".contains(regex(#"\p{Variation_Selector}"#))) // MARK: Numeric // Numeric_Value // Numeric_Type // Hex_Digit + XCTAssertTrue("0123456789abcdef0123456789ABCDEF" + .contains(regex(#"^\p{Hex_Digit}+$"#))) + XCTAssertFalse("0123456789abcdefg".contains(regex(#"^\p{Hex_Digit}+$"#))) // ASCII_Hex_Digit + XCTAssertTrue("0123456789abcdef".contains(regex(#"^\p{ASCII_Hex_Digit}+$"#))) + XCTAssertFalse("0123456789abcdef0123456789ABCDEF" + .contains(regex(#"^\p{ASCII_Hex_Digit}+$"#))) // MARK: Identifiers // ID_Continue From da07c34b21e0bc942cae68512e0bd8cc926ceaf6 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 9 May 2022 13:16:01 -0500 Subject: [PATCH 05/10] Add support for `\p{nv=...}` and `\p{nt=...}` Numeric_Value and Numeric_Type Unicode property classes. --- Sources/_RegexParser/Regex/AST/Atom.swift | 6 +++++ .../CharacterPropertyClassification.swift | 23 +++++++++++++++++++ .../Regex/Parse/Diagnostics.swift | 8 ++++++- .../_StringProcessing/ConsumerInterface.swift | 6 +++++ Tests/RegexTests/ParseTests.swift | 4 ++++ Tests/RegexTests/UTS18Tests.swift | 8 +++++++ 6 files changed, 54 insertions(+), 1 deletion(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index f31a57fc9..e551ebf4f 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -399,6 +399,12 @@ extension AST.Atom.CharacterProperty { /// Character name in the form `\p{name=...}` case named(String) + /// Numeric type. + case numericType(Unicode.NumericType) + + /// Numeric value. + case numericValue(Double) + /// Character age, as per UnicodeScalar.Properties.age. case age(major: Int, minor: Int) diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index 1c54f800c..e1110a0b8 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -79,6 +79,19 @@ extension Source { } } + static private func classifyNumericType( + _ str: String + ) -> Unicode.NumericType? { + withNormalizedForms(str) { str in + switch str { + case "decimal": return .decimal + case "digit": return .digit + case "numeric": return .numeric + default: return nil + } + } + } + static private func classifyBoolProperty( _ str: String ) -> Unicode.BinaryProperty? { @@ -459,6 +472,16 @@ extension Source { return .age(major: major, minor: minor) case "name", "na": return .named(value) + case "numericvalue", "nv": + guard let numericValue = Double(value) else { + throw ParseError.invalidNumericValue(value) + } + return .numericValue(numericValue) + case "numerictype", "nt": + guard let type = classifyNumericType(value) else { + throw ParseError.unrecognizedNumericType(value) + } + return .numericType(type) default: break } diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index e913a5b6f..3d424d6f9 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -62,6 +62,8 @@ enum ParseError: Error, Hashable { case unrecognizedScript(String) case unrecognizedCategory(String) case invalidAge(String) + case invalidNumericValue(String) + case unrecognizedNumericType(String) case expectedGroupSpecifier case unbalancedEndOfGroup @@ -174,8 +176,12 @@ extension ParseError: CustomStringConvertible { return "unrecognized script '\(value)'" case .unrecognizedCategory(let value): return "unrecognized category '\(value)'" + case .unrecognizedNumericType(let value): + return "unrecognized numeric type '\(value)'" case .invalidAge(let value): - return "invalid age format for '\(value)'. Use '3.0' or 'V3_0' formats." + return "invalid age format for '\(value)' - use '3.0' or 'V3_0' formats" + case .invalidNumericValue(let value): + return "invalid numeric value '\(value)'" } } } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 485bb58db..4a26d683c 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -494,6 +494,12 @@ extension AST.Atom.CharacterProperty { return age <= (major, minor) } + case .numericValue(let value): + return consume { $0.properties.numericValue == value } + + case .numericType(let type): + return consume { $0.properties.numericType == type } + case .posix(let p): return p.generateConsumer(opts) diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index a8ead7ade..cdb3ff9e7 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2263,6 +2263,10 @@ extension RegexTests { diagnosticTest(#"\p{age=3}"#, .invalidAge("3")) diagnosticTest(#"\p{age=V3}"#, .invalidAge("V3")) diagnosticTest(#"\p{age=3.0.1}"#, .invalidAge("3.0.1")) + diagnosticTest(#"\p{nv=A}"#, .invalidNumericValue("A")) + diagnosticTest(#"\p{Numeric_Value=1.2.3.4}"#, .invalidNumericValue("1.2.3.4")) + diagnosticTest(#"\p{nt=Not_A_NumericType}"#, .unrecognizedNumericType("Not_A_NumericType")) + diagnosticTest(#"\p{Numeric_Type=Nuemric}"#, .unrecognizedNumericType("Nuemric")) diagnosticTest(#"(?#"#, .expected(")")) diagnosticTest(#"(?x"#, .expected(")")) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index d45e09c88..d0ed1dd4c 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -552,7 +552,15 @@ extension UTS18Tests { // MARK: Numeric // Numeric_Value + XCTAssertTrue("3".contains(regex(#"\p{Numeric_Value=3}"#))) + XCTAssertFalse("4".contains(regex(#"\p{Numeric_Value=3}"#))) + XCTAssertTrue("④".contains(regex(#"\p{Numeric_Value=4}"#))) + XCTAssertTrue("⅕".contains(regex(#"\p{Numeric_Value=0.2}"#))) + // Numeric_Type + XCTAssertTrue("3".contains(regex(#"\p{Numeric_Type=Decimal}"#))) + XCTAssertFalse("4".contains(regex(#"\p{Numeric_Type=Digit}"#))) + // Hex_Digit XCTAssertTrue("0123456789abcdef0123456789ABCDEF" .contains(regex(#"^\p{Hex_Digit}+$"#))) From 60779094e9b26bd7592abfbfe27120bc1d326e64 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Fri, 13 May 2022 14:10:32 -0500 Subject: [PATCH 06/10] Continue filling out the properties tests --- Tests/RegexTests/UTS18Tests.swift | 50 +++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index fad9d8cd9..24de3177e 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -591,21 +591,61 @@ extension UTS18Tests { .contains(regex(#"^\p{ASCII_Hex_Digit}+$"#))) // MARK: Identifiers - // ID_Continue // ID_Start - // XID_Continue + XCTAssertTrue("ABcd".contains(regex(#"^\p{ID_Start}+$"#))) + XCTAssertFalse(" ':`-".contains(regex(#"\p{ID_Start}"#))) + + // ID_Continue + XCTAssertTrue("ABcd_1234".contains(regex(#"^\p{ID_Continue}+$"#))) + XCTAssertFalse(" ':`-".contains(regex(#"\p{ID_Continue}"#))) + // XID_Start + XCTAssertTrue("ABcd".contains(regex(#"^\p{XID_Start}+$"#))) + XCTAssertFalse(" ':`-".contains(regex(#"\p{XID_Start}"#))) + + // XID_Continue + XCTAssertTrue("ABcd_1234".contains(regex(#"^\p{XID_Continue}+$"#))) + XCTAssertFalse(" ':`-".contains(regex(#"\p{XID_Continue}"#))) + // Pattern_Syntax + XCTAssertTrue(".+-:".contains(regex(#"^\p{Pattern_Syntax}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Pattern_Syntax}"#))) + // Pattern_White_Space + XCTAssertTrue(" \t\n".contains(regex(#"^\p{Pattern_White_Space}+$"#))) + XCTAssertFalse("abc123".contains(regex(#"\p{Pattern_White_Space}"#))) + // Identifier_Status + XCTExpectFailure { + XCTFail(#"Unsupported: \(#/\p{Identifier_Status=Allowed}/#)"#) + // XCTAssertTrue("A".contains(regex(#"\p{Identifier_Status=Allowed}"#))) + // XCTAssertFalse(" ".contains(regex(#"\p{Identifier_Status=Allowed}"#))) + // XCTAssertTrue(" ".contains(regex(#"\p{Identifier_Status=Restricted}"#))) + } // Identifier_Type + XCTExpectFailure { + XCTFail(#"Unsupported: \(#/\p{Identifier_Type=Inclusion}/#)"#) + // XCTAssertTrue("'".contains(regex(#"\p{Identifier_Type=Inclusion}"#))) + } // MARK: CJK // Ideographic + XCTAssertTrue("微笑".contains(regex(#"^\p{IsIdeographic}+$"#))) + XCTAssertFalse("abc123".contains(regex(#"\p{Ideographic}"#))) + // Unified_Ideograph + XCTAssertTrue("微笑".contains(regex(#"^\p{Unified_Ideograph}+$"#))) + XCTAssertFalse("abc123".contains(regex(#"\p{Unified_Ideograph}"#))) + // Radical + XCTAssertTrue("⺁⺂⺆".contains(regex(#"^\p{Radical}+$"#))) + // IDS_Binary_Operator + XCTAssertTrue("⿰⿸⿻".contains(regex(#"^\p{IDS_Binary_Operator}+$"#))) + // IDS_Trinary_Operator + XCTAssertTrue("⿲⿳".contains(regex(#"^\p{IDS_Trinary_Operator}+$"#))) + // Equivalent_Unified_Ideograph XCTExpectFailure { XCTFail(#"Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)"#) @@ -614,7 +654,13 @@ extension UTS18Tests { // MARK: Case // Uppercase + XCTAssertTrue("AÉÎØÜ".contains(regex(#"^\p{isUppercase}+$"#))) + XCTAssertFalse("123abc".contains(regex(#"^\p{isUppercase}+$"#))) + // Lowercase + XCTAssertTrue("aéîøü".contains(regex(#"^\p{Lowercase}+$"#))) + XCTAssertFalse("123abc".contains(regex(#"\p{Lowercase}+$"#))) + // Simple_Lowercase_Mapping // Simple_Titlecase_Mapping // Simple_Uppercase_Mapping From 5178502fce22630033d7bc9cc939d137e6090de1 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Sun, 15 May 2022 22:43:03 -0500 Subject: [PATCH 07/10] Add remaining supported Unicode binary properties --- Sources/_RegexParser/Regex/Parse/Sema.swift | 6 +- .../_StringProcessing/ConsumerInterface.swift | 2 +- Tests/RegexTests/UTS18Tests.swift | 154 +++++++++++++++++- 3 files changed, 157 insertions(+), 5 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 4de98557e..bba7a73c7 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -127,8 +127,8 @@ extension RegexValidator { _ prop: Unicode.BinaryProperty, at loc: SourceLocation ) throws { switch prop { - case .asciiHexDigit, .alphabetic, .bidiMirrored, .cased, .caseIgnorable, - .changesWhenCasefolded, .changesWhenCasemapped, + case .asciiHexDigit, .alphabetic, .bidiControl, .bidiMirrored, .cased, + .caseIgnorable, .changesWhenCasefolded, .changesWhenCasemapped, .changesWhenNFKCCasefolded, .changesWhenLowercased, .changesWhenTitlecased, .changesWhenUppercased, .dash, .deprecated, .defaultIgnorableCodePoint, .diacratic, .extender, @@ -150,7 +150,7 @@ extension RegexValidator { case .expandsOnNFC, .expandsOnNFD, .expandsOnNFKD, .expandsOnNFKC: throw error(.deprecatedUnicode(prop.rawValue.quoted), at: loc) - case .bidiControl, .compositionExclusion, .emojiComponent, + case .compositionExclusion, .emojiComponent, .extendedPictographic, .graphemeLink, .hyphen, .otherAlphabetic, .otherDefaultIgnorableCodePoint, .otherGraphemeExtended, .otherIDContinue, .otherIDStart, .otherLowercase, .otherMath, diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 1765f07ac..91c7ed3c3 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -534,7 +534,7 @@ extension Unicode.BinaryProperty { case .alphabetic: return consume(propertyScalarPredicate(\.isAlphabetic)) case .bidiControl: - break + return consume(propertyScalarPredicate(\.isBidiControl)) case .bidiMirrored: return consume(propertyScalarPredicate(\.isBidiMirrored)) case .cased: diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 24de3177e..3c35466e6 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -659,13 +659,28 @@ extension UTS18Tests { // Lowercase XCTAssertTrue("aéîøü".contains(regex(#"^\p{Lowercase}+$"#))) - XCTAssertFalse("123abc".contains(regex(#"\p{Lowercase}+$"#))) + XCTAssertFalse("123ABC".contains(regex(#"\p{Lowercase}+$"#))) // Simple_Lowercase_Mapping +// XCTAssertTrue("ABC".contains(regex(#"^\p{Simple_Lowercase_Mapping}+$"#))) +// XCTAssertFalse("abc123".contains(regex(#"\p{Simple_Lowercase_Mapping}"#))) + // Simple_Titlecase_Mapping +// XCTAssertTrue("abc".contains(regex(#"^\p{Simple_Titlecase_Mapping}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Simple_Titlecase_Mapping}"#))) + // Simple_Uppercase_Mapping +// XCTAssertTrue("abc".contains(regex(#"^\p{Simple_Uppercase_Mapping}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Simple_Uppercase_Mapping}"#))) + // Simple_Case_Folding +// XCTAssertTrue("abc".contains(regex(#"^\p{Simple_Case_Folding}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Simple_Case_Folding}"#))) + // Soft_Dotted + XCTAssertTrue("ijɨʝⅈⅉ".contains(regex(#"^\p{Soft_Dotted}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Soft_Dotted}"#))) + // Cased XCTAssertTrue("A".contains(regex(#"\p{Cased}"#))) XCTAssertTrue("A".contains(regex(#"\p{Is_Cased}"#))) @@ -700,59 +715,196 @@ extension UTS18Tests { // MARK: Normalization // Canonical_Combining_Class +// XCTAssertTrue("abc".contains(regex(#"^\p{Canonical_Combining_Class}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Canonical_Combining_Class}"#))) + // Decomposition_Type +// XCTAssertTrue("abc".contains(regex(#"^\p{Decomposition_Type}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Decomposition_Type}"#))) + // NFC_Quick_Check +// XCTAssertTrue("abc".contains(regex(#"^\p{NFC_Quick_Check}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{NFC_Quick_Check}"#))) + // NFKC_Quick_Check +// XCTAssertTrue("abc".contains(regex(#"^\p{NFKC_Quick_Check}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{NFKC_Quick_Check}"#))) + // NFD_Quick_Check +// XCTAssertTrue("abc".contains(regex(#"^\p{NFD_Quick_Check}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{NFD_Quick_Check}"#))) + // NFKD_Quick_Check +// XCTAssertTrue("abc".contains(regex(#"^\p{NFKD_Quick_Check}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{NFKD_Quick_Check}"#))) + // NFKC_Casefold +// XCTAssertTrue("abc".contains(regex(#"^\p{NFKC_Casefold}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{NFKC_Casefold}"#))) + // Changes_When_NFKC_Casefolded + XCTAssertTrue("ABCÊÖ".contains(regex(#"^\p{Changes_When_NFKC_Casefolded}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Changes_When_NFKC_Casefolded}"#))) // MARK: Emoji // Emoji + XCTAssertTrue("🥰🥳🤩".contains(regex(#"^\p{Emoji}+$"#))) + XCTAssertFalse("abc ◎✩℥".contains(regex(#"\p{Emoji}"#))) + // Emoji_Presentation + XCTAssertTrue("⌚☕☔".contains(regex(#"^\p{Emoji_Presentation}+$"#))) + XCTAssertFalse("abc ǽǮ".contains(regex(#"\p{Emoji_Presentation}"#))) + // Emoji_Modifier + XCTAssertTrue("\u{1F3FB}\u{1F3FC}\u{1F3FD}".contains(regex(#"^\p{Emoji_Modifier}+$"#))) + XCTAssertFalse("🧒".contains(regex(#"\p{Emoji_Modifier}"#))) + // Emoji_Modifier_Base + XCTAssertTrue("🧒".contains(regex(#"^\p{Emoji_Modifier_Base}+$"#))) + XCTAssertFalse("123 🧠".contains(regex(#"\p{Emoji_Modifier_Base}"#))) + // Emoji_Component +// XCTAssertTrue("abc".contains(regex(#"^\p{Emoji_Component}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Emoji_Component}"#))) + // Extended_Pictographic +// XCTAssertTrue("abc".contains(regex(#"^\p{Extended_Pictographic}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Extended_Pictographic}"#))) + // Basic_Emoji* +// XCTAssertTrue("abc".contains(regex(#"^\p{Basic_Emoji*}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Basic_Emoji*}"#))) + // Emoji_Keycap_Sequence* +// XCTAssertTrue("abc".contains(regex(#"^\p{Emoji_Keycap_Sequence*}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Emoji_Keycap_Sequence*}"#))) + // RGI_Emoji_Modifier_Sequence* +// XCTAssertTrue("abc".contains(regex(#"^\p{RGI_Emoji_Modifier_Sequence*}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{RGI_Emoji_Modifier_Sequence*}"#))) + // RGI_Emoji_Flag_Sequence* +// XCTAssertTrue("abc".contains(regex(#"^\p{RGI_Emoji_Flag_Sequence*}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{RGI_Emoji_Flag_Sequence*}"#))) + // RGI_Emoji_Tag_Sequence* +// XCTAssertTrue("abc".contains(regex(#"^\p{RGI_Emoji_Tag_Sequence*}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{RGI_Emoji_Tag_Sequence*}"#))) + // RGI_Emoji_ZWJ_Sequence* +// XCTAssertTrue("abc".contains(regex(#"^\p{RGI_Emoji_ZWJ_Sequence*}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{RGI_Emoji_ZWJ_Sequence*}"#))) + // RGI_Emoji* +// XCTAssertTrue("abc".contains(regex(#"^\p{RGI_Emoji*}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{RGI_Emoji*}"#))) // MARK: Shaping and Rendering // Join_Control + XCTAssertTrue("\u{200C}\u{200D}".contains(regex(#"^\p{Join_Control}+$"#))) + XCTAssertFalse("123".contains(regex(#"\p{Join_Control}"#))) + // Joining_Group +// XCTAssertTrue("abc".contains(regex(#"^\p{Joining_Group}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Joining_Group}"#))) + // Joining_Type +// XCTAssertTrue("abc".contains(regex(#"^\p{Joining_Type}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Joining_Type}"#))) + // Vertical_Orientation +// XCTAssertTrue("abc".contains(regex(#"^\p{Vertical_Orientation}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Vertical_Orientation}"#))) + // Line_Break +// XCTAssertTrue("abc".contains(regex(#"^\p{Line_Break}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Line_Break}"#))) + // Grapheme_Cluster_Break +// XCTAssertTrue("abc".contains(regex(#"^\p{Grapheme_Cluster_Break}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Grapheme_Cluster_Break}"#))) + // Sentence_Break +// XCTAssertTrue("abc".contains(regex(#"^\p{Sentence_Break}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Sentence_Break}"#))) + // Word_Break +// XCTAssertTrue("abc".contains(regex(#"^\p{Word_Break}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Word_Break}"#))) + // East_Asian_Width +// XCTAssertTrue("abc".contains(regex(#"^\p{East_Asian_Width}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{East_Asian_Width}"#))) + // Prepended_Concatenation_Mark +// XCTAssertTrue("abc".contains(regex(#"^\p{Prepended_Concatenation_Mark}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Prepended_Concatenation_Mark}"#))) // MARK: Bidirectional // Bidi_Class +// XCTAssertTrue("abc".contains(regex(#"^\p{Bidi_Class}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Bidi_Class}"#))) + // Bidi_Control + XCTAssertTrue("\u{200E}\u{200F}\u{2069}".contains(regex(#"^\p{Bidi_Control}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Bidi_Control}"#))) + // Bidi_Mirrored + XCTAssertTrue("()<>{}❮❯«»".contains(regex(#"^\p{Bidi_Mirrored}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Bidi_Mirrored}"#))) + // Bidi_Mirroring_Glyph +// XCTAssertTrue("abc".contains(regex(#"^\p{Bidi_Mirroring_Glyph}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Bidi_Mirroring_Glyph}"#))) + // Bidi_Paired_Bracket +// XCTAssertTrue("abc".contains(regex(#"^\p{Bidi_Paired_Bracket}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Bidi_Paired_Bracket}"#))) + // Bidi_Paired_Bracket_Type +// XCTAssertTrue("abc".contains(regex(#"^\p{Bidi_Paired_Bracket_Type}+$"#))) +// XCTAssertFalse("123".contains(regex(#"\p{Bidi_Paired_Bracket_Type}"#))) + // MARK: Miscellaneous // Math + XCTAssertTrue("𝒶𝖇𝕔𝖽𝗲𝘧𝙜𝚑𝛊𝜅𝝀𝝡𝞰𝟙𝟐𝟯𝟺".contains(regex(#"^\p{Math}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Math}"#))) + // Quotation_Mark + XCTAssertTrue(#"“«‘"’»”"#.contains(regex(#"^\p{Quotation_Mark}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Quotation_Mark}"#))) + // Dash + XCTAssertTrue("—-–".contains(regex(#"^\p{Dash}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Dash}"#))) + // Sentence_Terminal + XCTAssertTrue(".!?".contains(regex(#"^\p{Sentence_Terminal}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Sentence_Terminal}"#))) + // Terminal_Punctuation + XCTAssertTrue(":?!.".contains(regex(#"^\p{Terminal_Punctuation}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Terminal_Punctuation}"#))) + // Diacritic + XCTAssertTrue("¨`^¯ʸ".contains(regex(#"^\p{Diacritic}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Diacritic}"#))) + // Extender + XCTAssertTrue("ᪧː々".contains(regex(#"^\p{Extender}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Extender}"#))) + // Grapheme_Base + XCTAssertTrue("abc".contains(regex(#"^\p{Grapheme_Base}+$"#))) + XCTAssertFalse("\u{301}\u{FE0F}".contains(regex(#"\p{Grapheme_Base}"#))) + // Grapheme_Extend + XCTAssertTrue("\u{301}\u{302}\u{303}".contains(regex(#"^\p{Grapheme_Extend}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Grapheme_Extend}"#))) + // Regional_Indicator + XCTAssertTrue("🇰🇷🇬🇭🇵🇪".contains(regex(#"^\p{Regional_Indicator}+$"#))) + XCTAssertFalse("abc 123".contains(regex(#"\p{Regional_Indicator}"#))) } } From a8867244fb16edecf99fce0cd377764ef9971e7f Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 16 May 2022 00:54:32 -0500 Subject: [PATCH 08/10] Add support for simple case mapping properties --- Sources/_RegexParser/Regex/AST/Atom.swift | 9 +++++++++ .../Parse/CharacterPropertyClassification.swift | 10 ++++++++-- Sources/_RegexParser/Regex/Parse/Diagnostics.swift | 5 ++++- Sources/_RegexParser/Regex/Parse/Sema.swift | 2 +- Sources/_StringProcessing/ConsumerInterface.swift | 9 +++++++++ Tests/RegexTests/ParseTests.swift | 2 ++ Tests/RegexTests/UTS18Tests.swift | 12 ++++++------ 7 files changed, 39 insertions(+), 10 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index cf332d62e..9f973de0c 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -433,6 +433,9 @@ extension AST.Atom.CharacterProperty { /// Numeric value. case numericValue(Double) + /// Case mapping. + case mapping(MapKind, String) + /// Character age, as per UnicodeScalar.Properties.age. case age(major: Int, minor: Int) @@ -441,6 +444,12 @@ extension AST.Atom.CharacterProperty { /// Some special properties implemented by PCRE and Oniguruma. case pcreSpecial(PCRESpecialCategory) case onigurumaSpecial(OnigurumaSpecialProperty) + + public enum MapKind: Hashable { + case lowercase + case uppercase + case titlecase + } } // TODO: erm, separate out or fold into something? splat it in? diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index adcfb47e2..df5ab3886 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -448,8 +448,8 @@ extension Source { // This uses the aliases defined in // https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt. - let match = try withNormalizedForms(key) { key -> PropertyKind? in - switch key { + let match = try withNormalizedForms(key) { normalizedKey -> PropertyKind? in + switch normalizedKey { case "script", "sc": guard let script = classifyScriptProperty(value) else { throw ParseError.unrecognizedScript(value) @@ -482,6 +482,12 @@ extension Source { throw ParseError.unrecognizedNumericType(value) } return .numericType(type) + case "slc", "simplelowercasemapping": + return .mapping(.lowercase, value) + case "suc", "simpleuppercasemapping": + return .mapping(.uppercase, value) + case "stc", "simpletitlecasemapping": + return .mapping(.titlecase, value) default: break } diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index ebafaffcb..f1ddd53f1 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -64,7 +64,8 @@ enum ParseError: Error, Hashable { case invalidAge(String) case invalidNumericValue(String) case unrecognizedNumericType(String) - + case expectedMapping(String) + case expectedGroupSpecifier case unbalancedEndOfGroup @@ -196,6 +197,8 @@ extension ParseError: CustomStringConvertible { return "invalid age format for '\(value)' - use '3.0' or 'V3_0' formats" case .invalidNumericValue(let value): return "invalid numeric value '\(value)'" + case .expectedMapping(let mapping): + return "missing value for '\(mapping)'" // MARK: Semantic Errors diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index bba7a73c7..8332afec6 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -169,7 +169,7 @@ extension RegexValidator { case .binary(let b, _): try validateBinaryProperty(b, at: loc) case .any, .assigned, .ascii, .generalCategory, .posix, .named, .script, - .scriptExtension, .age, .numericType, .numericValue: + .scriptExtension, .age, .numericType, .numericValue, .mapping: break case .pcreSpecial: throw error(.unsupported("PCRE property"), at: loc) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 91c7ed3c3..a54017296 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -500,6 +500,15 @@ extension AST.Atom.CharacterProperty { case .numericType(let type): return consume { $0.properties.numericType == type } + case .mapping(.lowercase, let value): + return consume { $0.properties.lowercaseMapping == value } + + case .mapping(.uppercase, let value): + return consume { $0.properties.uppercaseMapping == value } + + case .mapping(.titlecase, let value): + return consume { $0.properties.titlecaseMapping == value } + case .posix(let p): return p.generateConsumer(opts) diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index a2f44c013..e0eef16ea 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2473,6 +2473,8 @@ extension RegexTests { diagnosticTest(#"\p{Numeric_Value=1.2.3.4}"#, .invalidNumericValue("1.2.3.4")) diagnosticTest(#"\p{nt=Not_A_NumericType}"#, .unrecognizedNumericType("Not_A_NumericType")) diagnosticTest(#"\p{Numeric_Type=Nuemric}"#, .unrecognizedNumericType("Nuemric")) + diagnosticTest(#"\p{Simple_Lowercase_Mapping}"#, .unknownProperty(key: nil, value: "Simple_Lowercase_Mapping")) + diagnosticTest(#"\p{Simple_Lowercase_Mapping=}"#, .emptyProperty) diagnosticTest(#"(?#"#, .expected(")")) diagnosticTest(#"(?x"#, .expected(")")) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 3c35466e6..04f6ff7b9 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -662,16 +662,16 @@ extension UTS18Tests { XCTAssertFalse("123ABC".contains(regex(#"\p{Lowercase}+$"#))) // Simple_Lowercase_Mapping -// XCTAssertTrue("ABC".contains(regex(#"^\p{Simple_Lowercase_Mapping}+$"#))) -// XCTAssertFalse("abc123".contains(regex(#"\p{Simple_Lowercase_Mapping}"#))) + XCTAssertTrue("aAa".contains(regex(#"^\p{Simple_Lowercase_Mapping=a}+$"#))) + XCTAssertFalse("bBå".contains(regex(#"\p{Simple_Lowercase_Mapping=a}"#))) // Simple_Titlecase_Mapping -// XCTAssertTrue("abc".contains(regex(#"^\p{Simple_Titlecase_Mapping}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Simple_Titlecase_Mapping}"#))) + XCTAssertTrue("aAa".contains(regex(#"^\p{Simple_Titlecase_Mapping=A}+$"#))) + XCTAssertFalse("bBå".contains(regex(#"\p{Simple_Titlecase_Mapping=A}"#))) // Simple_Uppercase_Mapping -// XCTAssertTrue("abc".contains(regex(#"^\p{Simple_Uppercase_Mapping}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Simple_Uppercase_Mapping}"#))) + XCTAssertTrue("aAa".contains(regex(#"^\p{Simple_Uppercase_Mapping=A}+$"#))) + XCTAssertFalse("bBå".contains(regex(#"\p{Simple_Uppercase_Mapping=A}"#))) // Simple_Case_Folding // XCTAssertTrue("abc".contains(regex(#"^\p{Simple_Case_Folding}+$"#))) From 150f21ab4f510b0daa095d1a90b022a6d43a6c7f Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 16 May 2022 01:04:15 -0500 Subject: [PATCH 09/10] Add support for canonical combining class --- Sources/_RegexParser/Regex/AST/Atom.swift | 3 +++ .../Regex/Parse/CharacterPropertyClassification.swift | 5 +++++ Sources/_RegexParser/Regex/Parse/Diagnostics.swift | 6 +++--- Sources/_RegexParser/Regex/Parse/Sema.swift | 2 +- Sources/_StringProcessing/ConsumerInterface.swift | 3 +++ Tests/RegexTests/ParseTests.swift | 2 ++ Tests/RegexTests/UTS18Tests.swift | 4 ++-- 7 files changed, 19 insertions(+), 6 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 9f973de0c..6721076fc 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -436,6 +436,9 @@ extension AST.Atom.CharacterProperty { /// Case mapping. case mapping(MapKind, String) + /// Canonical Combining Class. + case ccc(Unicode.CanonicalCombiningClass) + /// Character age, as per UnicodeScalar.Properties.age. case age(major: Int, minor: Int) diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index df5ab3886..21b5ddc68 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -488,6 +488,11 @@ extension Source { return .mapping(.uppercase, value) case "stc", "simpletitlecasemapping": return .mapping(.titlecase, value) + case "ccc", "canonicalcombiningclass": + guard let cccValue = UInt8(value), cccValue <= 254 else { + throw ParseError.invalidCCC(value) + } + return .ccc(.init(rawValue: cccValue)) default: break } diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index f1ddd53f1..05bf4ba1a 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -64,7 +64,7 @@ enum ParseError: Error, Hashable { case invalidAge(String) case invalidNumericValue(String) case unrecognizedNumericType(String) - case expectedMapping(String) + case invalidCCC(String) case expectedGroupSpecifier case unbalancedEndOfGroup @@ -197,8 +197,8 @@ extension ParseError: CustomStringConvertible { return "invalid age format for '\(value)' - use '3.0' or 'V3_0' formats" case .invalidNumericValue(let value): return "invalid numeric value '\(value)'" - case .expectedMapping(let mapping): - return "missing value for '\(mapping)'" + case .invalidCCC(let value): + return "invalid canonical combining class '\(value)'" // MARK: Semantic Errors diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 8332afec6..be28754b8 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -169,7 +169,7 @@ extension RegexValidator { case .binary(let b, _): try validateBinaryProperty(b, at: loc) case .any, .assigned, .ascii, .generalCategory, .posix, .named, .script, - .scriptExtension, .age, .numericType, .numericValue, .mapping: + .scriptExtension, .age, .numericType, .numericValue, .mapping, .ccc: break case .pcreSpecial: throw error(.unsupported("PCRE property"), at: loc) diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index a54017296..334635585 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -500,6 +500,9 @@ extension AST.Atom.CharacterProperty { case .numericType(let type): return consume { $0.properties.numericType == type } + case .ccc(let ccc): + return consume { $0.properties.canonicalCombiningClass == ccc } + case .mapping(.lowercase, let value): return consume { $0.properties.lowercaseMapping == value } diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index e0eef16ea..332b7fe29 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -2475,6 +2475,8 @@ extension RegexTests { diagnosticTest(#"\p{Numeric_Type=Nuemric}"#, .unrecognizedNumericType("Nuemric")) diagnosticTest(#"\p{Simple_Lowercase_Mapping}"#, .unknownProperty(key: nil, value: "Simple_Lowercase_Mapping")) diagnosticTest(#"\p{Simple_Lowercase_Mapping=}"#, .emptyProperty) + diagnosticTest(#"\p{ccc=255}"#, .invalidCCC("255")) + diagnosticTest(#"\p{ccc=Nada}"#, .invalidCCC("Nada")) diagnosticTest(#"(?#"#, .expected(")")) diagnosticTest(#"(?x"#, .expected(")")) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 04f6ff7b9..1792b5eff 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -715,8 +715,8 @@ extension UTS18Tests { // MARK: Normalization // Canonical_Combining_Class -// XCTAssertTrue("abc".contains(regex(#"^\p{Canonical_Combining_Class}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Canonical_Combining_Class}"#))) + XCTAssertTrue("\u{0321}\u{0322}\u{1DD0}".contains(regex(#"^\p{Canonical_Combining_Class=202}+$"#))) + XCTAssertFalse("123".contains(regex(#"\p{Canonical_Combining_Class=202}"#))) // Decomposition_Type // XCTAssertTrue("abc".contains(regex(#"^\p{Decomposition_Type}+$"#))) From d97be11ded594ea3a187db111ca33509b56c7ade Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Mon, 16 May 2022 07:39:14 -0500 Subject: [PATCH 10/10] Clarify unsupported properties in tests --- Tests/RegexTests/UTS18Tests.swift | 239 ++++++++++++++++++------------ 1 file changed, 148 insertions(+), 91 deletions(-) diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index 1792b5eff..7306632da 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -151,10 +151,8 @@ extension UTS18Tests { } func testProperties_XFail() { - XCTExpectFailure("Need to support 'block' properties") { - // XCTAssertTrue("\u{1F00}".contains(#/\p{Block=Greek}/#)) - XCTFail(#"\(#/\p{Block=Greek}/#)"#) - } + // Certain properties are unsupported, see below. + XCTAssertThrowsError(try Regex(#"\p{Block=Greek}"#)) } // RL1.2a Compatibility Properties @@ -495,7 +493,8 @@ extension UTS18Tests { // To meet this requirement, an implementation shall support wildcards in // Unicode property values. func testWildcardsInPropertyValues() { - XCTExpectFailure { XCTFail("Implement tests") } + // Unsupported + XCTAssertThrowsError(try Regex(#"\p{name=/a/"#)) } // RL2.7 Full Properties @@ -510,10 +509,7 @@ extension UTS18Tests { XCTAssertTrue("a".contains(regex(#"\p{name=latin small letter a}"#))) // Block - XCTExpectFailure { - XCTFail(#"Unsupported: \(#/^\p{block=Block Elements}+$/#)"#) - // XCTAssertTrue("▂▃▄▅▆▇".contains(regex(#"^\p{block=Block Elements}+$"#))) - } + // Unsupported // Age XCTAssertTrue("a".contains(regex(#"\p{age=1.1}"#))) @@ -552,10 +548,7 @@ extension UTS18Tests { XCTAssertTrue("aéîøüƒ".contains(regex(#"^\p{Alphabetic}+$"#))) // Hangul_Syllable_Type - XCTExpectFailure { - XCTFail(#"Unsupported: \(#/\p{Hangul_Syllable_Type=L}/#)"#) - // XCTAssertTrue("ㄱ".contains(regex(#"\p{Hangul_Syllable_Type=L}"#))) - } + // Unsupported // Noncharacter_Code_Point XCTAssertTrue("\u{10FFFF}".contains(regex(#"\p{Noncharacter_Code_Point}"#))) @@ -616,17 +609,10 @@ extension UTS18Tests { XCTAssertFalse("abc123".contains(regex(#"\p{Pattern_White_Space}"#))) // Identifier_Status - XCTExpectFailure { - XCTFail(#"Unsupported: \(#/\p{Identifier_Status=Allowed}/#)"#) - // XCTAssertTrue("A".contains(regex(#"\p{Identifier_Status=Allowed}"#))) - // XCTAssertFalse(" ".contains(regex(#"\p{Identifier_Status=Allowed}"#))) - // XCTAssertTrue(" ".contains(regex(#"\p{Identifier_Status=Restricted}"#))) - } + // Unsupported + // Identifier_Type - XCTExpectFailure { - XCTFail(#"Unsupported: \(#/\p{Identifier_Type=Inclusion}/#)"#) - // XCTAssertTrue("'".contains(regex(#"\p{Identifier_Type=Inclusion}"#))) - } + // Unsupported // MARK: CJK // Ideographic @@ -647,10 +633,7 @@ extension UTS18Tests { XCTAssertTrue("⿲⿳".contains(regex(#"^\p{IDS_Trinary_Operator}+$"#))) // Equivalent_Unified_Ideograph - XCTExpectFailure { - XCTFail(#"Unsupported: \(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)"#) - // XCTAssertTrue("⼚⺁厂".contains(#/^\p{Equivalent_Unified_Ideograph=⼚}+$/#)) - } + // Unsupported // MARK: Case // Uppercase @@ -674,8 +657,7 @@ extension UTS18Tests { XCTAssertFalse("bBå".contains(regex(#"\p{Simple_Uppercase_Mapping=A}"#))) // Simple_Case_Folding -// XCTAssertTrue("abc".contains(regex(#"^\p{Simple_Case_Folding}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Simple_Case_Folding}"#))) + // Unsupported // Soft_Dotted XCTAssertTrue("ijɨʝⅈⅉ".contains(regex(#"^\p{Soft_Dotted}+$"#))) @@ -712,35 +694,29 @@ extension UTS18Tests { // Changes_When_Casemapped XCTAssertTrue("a".contains(regex(#"\p{Changes_When_Casemapped}"#))) XCTAssertFalse(":".contains(regex(#"\p{Changes_When_Casemapped}"#))) - + // MARK: Normalization // Canonical_Combining_Class XCTAssertTrue("\u{0321}\u{0322}\u{1DD0}".contains(regex(#"^\p{Canonical_Combining_Class=202}+$"#))) XCTAssertFalse("123".contains(regex(#"\p{Canonical_Combining_Class=202}"#))) - + // Decomposition_Type -// XCTAssertTrue("abc".contains(regex(#"^\p{Decomposition_Type}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Decomposition_Type}"#))) - + // Unsupported + // NFC_Quick_Check -// XCTAssertTrue("abc".contains(regex(#"^\p{NFC_Quick_Check}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{NFC_Quick_Check}"#))) - + // Unsupported + // NFKC_Quick_Check -// XCTAssertTrue("abc".contains(regex(#"^\p{NFKC_Quick_Check}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{NFKC_Quick_Check}"#))) - + // Unsupported + // NFD_Quick_Check -// XCTAssertTrue("abc".contains(regex(#"^\p{NFD_Quick_Check}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{NFD_Quick_Check}"#))) + // Unsupported // NFKD_Quick_Check -// XCTAssertTrue("abc".contains(regex(#"^\p{NFKD_Quick_Check}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{NFKD_Quick_Check}"#))) + // Unsupported // NFKC_Casefold -// XCTAssertTrue("abc".contains(regex(#"^\p{NFKC_Casefold}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{NFKC_Casefold}"#))) + // Unsupported // Changes_When_NFKC_Casefolded XCTAssertTrue("ABCÊÖ".contains(regex(#"^\p{Changes_When_NFKC_Casefolded}+$"#))) @@ -764,40 +740,31 @@ extension UTS18Tests { XCTAssertFalse("123 🧠".contains(regex(#"\p{Emoji_Modifier_Base}"#))) // Emoji_Component -// XCTAssertTrue("abc".contains(regex(#"^\p{Emoji_Component}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Emoji_Component}"#))) + // Unsupported // Extended_Pictographic -// XCTAssertTrue("abc".contains(regex(#"^\p{Extended_Pictographic}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Extended_Pictographic}"#))) + // Unsupported // Basic_Emoji* -// XCTAssertTrue("abc".contains(regex(#"^\p{Basic_Emoji*}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Basic_Emoji*}"#))) + // Unsupported // Emoji_Keycap_Sequence* -// XCTAssertTrue("abc".contains(regex(#"^\p{Emoji_Keycap_Sequence*}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Emoji_Keycap_Sequence*}"#))) + // Unsupported // RGI_Emoji_Modifier_Sequence* -// XCTAssertTrue("abc".contains(regex(#"^\p{RGI_Emoji_Modifier_Sequence*}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{RGI_Emoji_Modifier_Sequence*}"#))) + // Unsupported // RGI_Emoji_Flag_Sequence* -// XCTAssertTrue("abc".contains(regex(#"^\p{RGI_Emoji_Flag_Sequence*}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{RGI_Emoji_Flag_Sequence*}"#))) + // Unsupported // RGI_Emoji_Tag_Sequence* -// XCTAssertTrue("abc".contains(regex(#"^\p{RGI_Emoji_Tag_Sequence*}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{RGI_Emoji_Tag_Sequence*}"#))) + // Unsupported // RGI_Emoji_ZWJ_Sequence* -// XCTAssertTrue("abc".contains(regex(#"^\p{RGI_Emoji_ZWJ_Sequence*}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{RGI_Emoji_ZWJ_Sequence*}"#))) + // Unsupported // RGI_Emoji* -// XCTAssertTrue("abc".contains(regex(#"^\p{RGI_Emoji*}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{RGI_Emoji*}"#))) + // Unsupported // MARK: Shaping and Rendering // Join_Control @@ -805,45 +772,35 @@ extension UTS18Tests { XCTAssertFalse("123".contains(regex(#"\p{Join_Control}"#))) // Joining_Group -// XCTAssertTrue("abc".contains(regex(#"^\p{Joining_Group}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Joining_Group}"#))) + // Unsupported // Joining_Type -// XCTAssertTrue("abc".contains(regex(#"^\p{Joining_Type}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Joining_Type}"#))) + // Unsupported // Vertical_Orientation -// XCTAssertTrue("abc".contains(regex(#"^\p{Vertical_Orientation}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Vertical_Orientation}"#))) + // Unsupported // Line_Break -// XCTAssertTrue("abc".contains(regex(#"^\p{Line_Break}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Line_Break}"#))) + // Unsupported // Grapheme_Cluster_Break -// XCTAssertTrue("abc".contains(regex(#"^\p{Grapheme_Cluster_Break}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Grapheme_Cluster_Break}"#))) + // Unsupported // Sentence_Break -// XCTAssertTrue("abc".contains(regex(#"^\p{Sentence_Break}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Sentence_Break}"#))) + // Unsupported // Word_Break -// XCTAssertTrue("abc".contains(regex(#"^\p{Word_Break}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Word_Break}"#))) + // Unsupported // East_Asian_Width -// XCTAssertTrue("abc".contains(regex(#"^\p{East_Asian_Width}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{East_Asian_Width}"#))) + // Unsupported // Prepended_Concatenation_Mark -// XCTAssertTrue("abc".contains(regex(#"^\p{Prepended_Concatenation_Mark}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Prepended_Concatenation_Mark}"#))) + // Unsupported // MARK: Bidirectional // Bidi_Class -// XCTAssertTrue("abc".contains(regex(#"^\p{Bidi_Class}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Bidi_Class}"#))) + // Unsupported // Bidi_Control XCTAssertTrue("\u{200E}\u{200F}\u{2069}".contains(regex(#"^\p{Bidi_Control}+$"#))) @@ -854,17 +811,13 @@ extension UTS18Tests { XCTAssertFalse("abc 123".contains(regex(#"\p{Bidi_Mirrored}"#))) // Bidi_Mirroring_Glyph -// XCTAssertTrue("abc".contains(regex(#"^\p{Bidi_Mirroring_Glyph}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Bidi_Mirroring_Glyph}"#))) + // Unsupported // Bidi_Paired_Bracket -// XCTAssertTrue("abc".contains(regex(#"^\p{Bidi_Paired_Bracket}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Bidi_Paired_Bracket}"#))) + // Unsupported // Bidi_Paired_Bracket_Type -// XCTAssertTrue("abc".contains(regex(#"^\p{Bidi_Paired_Bracket_Type}+$"#))) -// XCTAssertFalse("123".contains(regex(#"\p{Bidi_Paired_Bracket_Type}"#))) - + // Unsupported // MARK: Miscellaneous // Math @@ -907,4 +860,108 @@ extension UTS18Tests { XCTAssertTrue("🇰🇷🇬🇭🇵🇪".contains(regex(#"^\p{Regional_Indicator}+$"#))) XCTAssertFalse("abc 123".contains(regex(#"\p{Regional_Indicator}"#))) } + + func testFullProperties_Unsupported() { + // Block + XCTAssertThrowsError(try Regex(#"\p{block=Block_Elements}"#)) + + // Hangul_Syllable_Type + XCTAssertThrowsError(try Regex(#"\p{Hangul_Syllable_Type=L}/"#)) + + // Identifier_Status + XCTAssertThrowsError(try Regex(#"\p{Identifier_Status=Allowed}"#)) + + // Identifier_Type + XCTAssertThrowsError(try Regex(#"\p{Identifier_Type=Inclusion}/"#)) + + // Equivalent_Unified_Ideograph + XCTAssertThrowsError(try Regex(#"\p{Equivalent_Unified_Ideograph=⼚}"#)) + + // Simple_Case_Folding + XCTAssertThrowsError(try Regex(#"\p{Simple_Case_Folding=a}/"#)) + + // Decomposition_Type + XCTAssertThrowsError(try Regex(#"\p{Decomposition_Type}"#)) + + // NFC_Quick_Check + XCTAssertThrowsError(try Regex(#"\p{NFC_Quick_Check}"#)) + + // NFKC_Quick_Check + XCTAssertThrowsError(try Regex(#"\p{NFKC_Quick_Check}"#)) + + // NFD_Quick_Check + XCTAssertThrowsError(try Regex(#"\p{NFD_Quick_Check}"#)) + + // NFKD_Quick_Check + XCTAssertThrowsError(try Regex(#"\p{NFKD_Quick_Check}"#)) + + // NFKC_Casefold + XCTAssertThrowsError(try Regex(#"\p{NFKC_Casefold}"#)) + + // Emoji_Component + XCTAssertThrowsError(try Regex(#"\p{Emoji_Component}"#)) + + // Extended_Pictographic + XCTAssertThrowsError(try Regex(#"\p{Extended_Pictographic}"#)) + + // Basic_Emoji* + XCTAssertThrowsError(try Regex(#"\p{Basic_Emoji*}"#)) + + // Emoji_Keycap_Sequence* + XCTAssertThrowsError(try Regex(#"\p{Emoji_Keycap_Sequence*}"#)) + + // RGI_Emoji_Modifier_Sequence* + XCTAssertThrowsError(try Regex(#"\p{RGI_Emoji_Modifier_Sequence*}"#)) + + // RGI_Emoji_Flag_Sequence* + XCTAssertThrowsError(try Regex(#"\p{RGI_Emoji_Flag_Sequence*}"#)) + + // RGI_Emoji_Tag_Sequence* + XCTAssertThrowsError(try Regex(#"\p{RGI_Emoji_Tag_Sequence*}"#)) + + // RGI_Emoji_ZWJ_Sequence* + XCTAssertThrowsError(try Regex(#"\p{RGI_Emoji_ZWJ_Sequence*}"#)) + + // RGI_Emoji* + XCTAssertThrowsError(try Regex(#"\p{RGI_Emoji*}"#)) + + // Joining_Group + XCTAssertThrowsError(try Regex(#"\p{Joining_Group}"#)) + + // Joining_Type + XCTAssertThrowsError(try Regex(#"\p{Joining_Type}"#)) + + // Vertical_Orientation + XCTAssertThrowsError(try Regex(#"\p{Vertical_Orientation}"#)) + + // Line_Break + XCTAssertThrowsError(try Regex(#"\p{Line_Break}"#)) + + // Grapheme_Cluster_Break + XCTAssertThrowsError(try Regex(#"\p{Grapheme_Cluster_Break}"#)) + + // Sentence_Break + XCTAssertThrowsError(try Regex(#"\p{Sentence_Break}"#)) + + // Word_Break + XCTAssertThrowsError(try Regex(#"\p{Word_Break}"#)) + + // East_Asian_Width + XCTAssertThrowsError(try Regex(#"\p{East_Asian_Width}"#)) + + // Prepended_Concatenation_Mark + XCTAssertThrowsError(try Regex(#"\p{Prepended_Concatenation_Mark}"#)) + + // Bidi_Class + XCTAssertThrowsError(try Regex(#"\p{Bidi_Class}"#)) + + // Bidi_Mirroring_Glyph + XCTAssertThrowsError(try Regex(#"\p{Bidi_Mirroring_Glyph}"#)) + + // Bidi_Paired_Bracket + XCTAssertThrowsError(try Regex(#"\p{Bidi_Paired_Bracket}"#)) + + // Bidi_Paired_Bracket_Type + XCTAssertThrowsError(try Regex(#"\p{Bidi_Paired_Bracket_Type}"#)) + } }