From 13342eb03bf0d44dfa8608f9a4fad7176970bf43 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 3 May 2022 13:49:10 +0100 Subject: [PATCH 1/2] Add matching support for `\p{Lc}` This is defined in UAX#44 as being equivalent to `Lu | Ll | Lt`. --- .gitignore | 3 +++ Sources/_StringProcessing/ConsumerInterface.swift | 5 +++-- Tests/RegexTests/MatchTests.swift | 6 ++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index a7e7e4d09..ff85b9fa3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ .DS_Store +# The current toolchain is dumping files in the package root, rude +*.emit-module.* + # Xcode # # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 356b7cc4b..a44c2c876 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -691,8 +691,9 @@ extension Unicode.ExtendedGeneralCategory { ]) case .casedLetter: - throw Unsupported( - "TODO: cased letter? not the property?") + return consumeScalarGCs([ + .uppercaseLetter, .lowercaseLetter, .titlecaseLetter + ]) case .control: return consumeScalarGC(.control) diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 345e80e22..769538b74 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -693,6 +693,12 @@ extension RegexTests { firstMatchTest(#"\p{gc=L}"#, input: "123abcXYZ", match: "a") firstMatchTest(#"\p{Lu}"#, input: "123abcXYZ", match: "X") + // U+0374 GREEK NUMERAL SIGN (Lm) + // U+00AA FEMININE ORDINAL INDICATOR (Lo) + firstMatchTest(#"\p{L}"#, input: "\u{0374}\u{00AA}123abcXYZ", match: "\u{0374}") + firstMatchTest(#"\p{Lc}"#, input: "\u{0374}\u{00AA}123abcXYZ", match: "a") + firstMatchTest(#"\p{Lc}"#, input: "\u{0374}\u{00AA}123XYZ", match: "X") + firstMatchTest( #"\P{Cc}"#, input: "\n\n\nXYZ", match: "X") firstMatchTest( From 925f51bc863aac2ddaa640a01c28843dd48ad5fc Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Tue, 3 May 2022 13:49:11 +0100 Subject: [PATCH 2/2] Add parser support for `\p{L&}` This is a PCRE spelling for a cased letter. --- .../Regex/Parse/CharacterPropertyClassification.swift | 6 +++--- Tests/RegexTests/MatchTests.swift | 2 ++ Tests/RegexTests/ParseTests.swift | 3 +++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index 911312121..5cc920063 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -32,8 +32,8 @@ extension Source { static private func classifyGeneralCategory( _ str: String ) -> Unicode.ExtendedGeneralCategory? { - // This uses the aliases defined in - // https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt. + // This uses the aliases defined in https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt. + // Additionally, uses the `L& = Lc` alias defined by PCRE. withNormalizedForms(str) { str in switch str { case "c", "other": return .other @@ -43,7 +43,7 @@ extension Source { case "co", "privateuse": return .privateUse case "cs", "surrogate": return .surrogate case "l", "letter": return .letter - case "lc", "casedletter": return .casedLetter + case "lc", "l&", "casedletter": return .casedLetter case "ll", "lowercaseletter": return .lowercaseLetter case "lm", "modifierletter": return .modifierLetter case "lo", "otherletter": return .otherLetter diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index 769538b74..2c6b858cc 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -698,6 +698,8 @@ extension RegexTests { firstMatchTest(#"\p{L}"#, input: "\u{0374}\u{00AA}123abcXYZ", match: "\u{0374}") firstMatchTest(#"\p{Lc}"#, input: "\u{0374}\u{00AA}123abcXYZ", match: "a") firstMatchTest(#"\p{Lc}"#, input: "\u{0374}\u{00AA}123XYZ", match: "X") + firstMatchTest(#"\p{L&}"#, input: "\u{0374}\u{00AA}123abcXYZ", match: "a") + firstMatchTest(#"\p{L&}"#, input: "\u{0374}\u{00AA}123XYZ", match: "X") firstMatchTest( #"\P{Cc}"#, input: "\n\n\nXYZ", match: "X") diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index aeefe6477..f0013b158 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1156,6 +1156,9 @@ extension RegexTests { #"\p{C}+"#, oneOrMore(of: prop(.generalCategory(.other)))) + // L& defined by PCRE. + parseTest(#"\p{L&}"#, prop(.generalCategory(.casedLetter))) + // UAX44-LM3 means all of the below are equivalent. let lowercaseLetter = prop(.generalCategory(.lowercaseLetter)) parseTest(#"\p{ll}"#, lowercaseLetter)