diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index 0087d734a..b7d8454bb 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -51,6 +51,10 @@ extension RegexComponent where Self == CharacterClass { public static var anyGrapheme: CharacterClass { .init(unconverted: .anyGrapheme) } + + public static var anyUnicodeScalar: CharacterClass { + .init(unconverted: .anyUnicodeScalar) + } public static var whitespace: CharacterClass { .init(unconverted: .whitespace) diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index 2debcda9d..c02725e33 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -33,6 +33,8 @@ public struct _CharacterClassModel: Hashable { case any /// Any grapheme cluster case anyGrapheme + /// Any Unicode scalar + case anyScalar /// Character.isDigit case digit /// Character.isHexDigit @@ -159,8 +161,12 @@ public struct _CharacterClassModel: Hashable { case .graphemeCluster: let c = str[i] var matched: Bool + var next = str.index(after: i) switch cc { case .any, .anyGrapheme: matched = true + case .anyScalar: + matched = true + next = str.unicodeScalars.index(after: i) case .digit: matched = c.isNumber && (c.isASCII || !options.usesASCIIDigits) case .hexDigit: @@ -178,12 +184,13 @@ public struct _CharacterClassModel: Hashable { if isInverted { matched.toggle() } - return matched ? str.index(after: i) : nil + return matched ? next : nil case .unicodeScalar: let c = str.unicodeScalars[i] var matched: Bool switch cc { case .any: matched = true + case .anyScalar: matched = true case .anyGrapheme: fatalError("Not matched in this mode") case .digit: matched = c.properties.numericType != nil && (c.isASCII || !options.usesASCIIDigits) @@ -228,6 +235,10 @@ extension _CharacterClassModel { .init(cc: .anyGrapheme, matchLevel: .graphemeCluster) } + public static var anyUnicodeScalar: _CharacterClassModel { + .init(cc: .any, matchLevel: .unicodeScalar) + } + public static var whitespace: _CharacterClassModel { .init(cc: .whitespace, matchLevel: .graphemeCluster) } @@ -279,6 +290,7 @@ extension _CharacterClassModel.Representation: CustomStringConvertible { switch self { case .any: return "" case .anyGrapheme: return "" + case .anyScalar: return "" case .digit: return "" case .hexDigit: return "" case .horizontalWhitespace: return "" @@ -445,6 +457,7 @@ extension AST.Atom.EscapedBuiltin { case .notWordCharacter: return .word.inverted case .graphemeCluster: return .anyGrapheme + case .trueAnychar: return .anyUnicodeScalar default: return nil diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index e00c77f56..dab53cc1c 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -1512,13 +1512,11 @@ extension RegexTests { (eDecomposed, false)) // FIXME: \O is unsupported - firstMatchTest(#"\O\u{301}"#, input: eDecomposed, match: eDecomposed, - xfail: true) - firstMatchTest(#"e\O"#, input: eDecomposed, match: eDecomposed, - xfail: true) - firstMatchTest(#"\O\u{301}"#, input: eComposed, match: nil, - xfail: true) - firstMatchTest(#"e\O"#, input: eComposed, match: nil, + firstMatchTest(#"(?u)\O\u{301}"#, input: eDecomposed, match: eDecomposed) + firstMatchTest(#"(?u)e\O"#, input: eDecomposed, match: eDecomposed, + xfail: true) + firstMatchTest(#"\O"#, input: eComposed, match: eComposed) + firstMatchTest(#"\O"#, input: eDecomposed, match: nil, xfail: true) matchTest(