Skip to content

Commit 7db6d42

Browse files
[3.13] gh-126505: Fix bugs in compiling case-insensitive character classes (GH-126557) (GH-126689)
* upper-case non-BMP character was ignored * the ASCII flag was ignored when matching a character range whose upper bound is beyond the BMP region (cherry picked from commit 819830f) Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent fc10908 commit 7db6d42

File tree

3 files changed

+73
-9
lines changed

3 files changed

+73
-9
lines changed

Lib/re/_compiler.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -248,19 +248,19 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
248248
while True:
249249
try:
250250
if op is LITERAL:
251-
if fixup:
252-
lo = fixup(av)
253-
charmap[lo] = 1
254-
if fixes and lo in fixes:
255-
for k in fixes[lo]:
251+
if fixup: # IGNORECASE and not LOCALE
252+
av = fixup(av)
253+
charmap[av] = 1
254+
if fixes and av in fixes:
255+
for k in fixes[av]:
256256
charmap[k] = 1
257257
if not hascased and iscased(av):
258258
hascased = True
259259
else:
260260
charmap[av] = 1
261261
elif op is RANGE:
262262
r = range(av[0], av[1]+1)
263-
if fixup:
263+
if fixup: # IGNORECASE and not LOCALE
264264
if fixes:
265265
for i in map(fixup, r):
266266
charmap[i] = 1
@@ -287,8 +287,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
287287
# Character set contains non-BMP character codes.
288288
# For range, all BMP characters in the range are already
289289
# processed.
290-
if fixup:
291-
hascased = True
290+
if fixup: # IGNORECASE and not LOCALE
292291
# For now, IN_UNI_IGNORE+LITERAL and
293292
# IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP
294293
# characters, because two characters (at least one of
@@ -299,7 +298,13 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
299298
# Also, both c.lower() and c.lower().upper() are single
300299
# characters for every non-BMP character.
301300
if op is RANGE:
302-
op = RANGE_UNI_IGNORE
301+
if fixes: # not ASCII
302+
op = RANGE_UNI_IGNORE
303+
hascased = True
304+
else:
305+
assert op is LITERAL
306+
if not hascased and iscased(av):
307+
hascased = True
303308
tail.append((op, av))
304309
break
305310

Lib/test/test_re.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1137,6 +1137,39 @@ def test_ignore_case_set(self):
11371137
self.assertTrue(re.match(br'[19a]', b'a', re.I))
11381138
self.assertTrue(re.match(br'[19a]', b'A', re.I))
11391139
self.assertTrue(re.match(br'[19A]', b'a', re.I))
1140+
self.assertTrue(re.match(r'[19\xc7]', '\xc7', re.I))
1141+
self.assertTrue(re.match(r'[19\xc7]', '\xe7', re.I))
1142+
self.assertTrue(re.match(r'[19\xe7]', '\xc7', re.I))
1143+
self.assertTrue(re.match(r'[19\xe7]', '\xe7', re.I))
1144+
self.assertTrue(re.match(r'[19\u0400]', '\u0400', re.I))
1145+
self.assertTrue(re.match(r'[19\u0400]', '\u0450', re.I))
1146+
self.assertTrue(re.match(r'[19\u0450]', '\u0400', re.I))
1147+
self.assertTrue(re.match(r'[19\u0450]', '\u0450', re.I))
1148+
self.assertTrue(re.match(r'[19\U00010400]', '\U00010400', re.I))
1149+
self.assertTrue(re.match(r'[19\U00010400]', '\U00010428', re.I))
1150+
self.assertTrue(re.match(r'[19\U00010428]', '\U00010400', re.I))
1151+
self.assertTrue(re.match(r'[19\U00010428]', '\U00010428', re.I))
1152+
1153+
self.assertTrue(re.match(br'[19A]', b'A', re.I))
1154+
self.assertTrue(re.match(br'[19a]', b'a', re.I))
1155+
self.assertTrue(re.match(br'[19a]', b'A', re.I))
1156+
self.assertTrue(re.match(br'[19A]', b'a', re.I))
1157+
self.assertTrue(re.match(r'[19A]', 'A', re.I|re.A))
1158+
self.assertTrue(re.match(r'[19a]', 'a', re.I|re.A))
1159+
self.assertTrue(re.match(r'[19a]', 'A', re.I|re.A))
1160+
self.assertTrue(re.match(r'[19A]', 'a', re.I|re.A))
1161+
self.assertTrue(re.match(r'[19\xc7]', '\xc7', re.I|re.A))
1162+
self.assertIsNone(re.match(r'[19\xc7]', '\xe7', re.I|re.A))
1163+
self.assertIsNone(re.match(r'[19\xe7]', '\xc7', re.I|re.A))
1164+
self.assertTrue(re.match(r'[19\xe7]', '\xe7', re.I|re.A))
1165+
self.assertTrue(re.match(r'[19\u0400]', '\u0400', re.I|re.A))
1166+
self.assertIsNone(re.match(r'[19\u0400]', '\u0450', re.I|re.A))
1167+
self.assertIsNone(re.match(r'[19\u0450]', '\u0400', re.I|re.A))
1168+
self.assertTrue(re.match(r'[19\u0450]', '\u0450', re.I|re.A))
1169+
self.assertTrue(re.match(r'[19\U00010400]', '\U00010400', re.I|re.A))
1170+
self.assertIsNone(re.match(r'[19\U00010400]', '\U00010428', re.I|re.A))
1171+
self.assertIsNone(re.match(r'[19\U00010428]', '\U00010400', re.I|re.A))
1172+
self.assertTrue(re.match(r'[19\U00010428]', '\U00010428', re.I|re.A))
11401173

11411174
# Two different characters have the same lowercase.
11421175
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
@@ -1173,8 +1206,10 @@ def test_ignore_case_range(self):
11731206
self.assertTrue(re.match(br'[9-a]', b'_', re.I))
11741207
self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
11751208
self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
1209+
self.assertTrue(re.match(r'[\xc0-\xde]', '\xe7', re.I))
11761210
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
11771211
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
1212+
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xc7', re.I))
11781213
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
11791214
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
11801215
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
@@ -1185,6 +1220,26 @@ def test_ignore_case_range(self):
11851220
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
11861221
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
11871222

1223+
self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I|re.A))
1224+
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xe7', re.I|re.A))
1225+
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I|re.A))
1226+
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xc7', re.I|re.A))
1227+
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I|re.A))
1228+
self.assertIsNone(re.match(r'[\u0430-\u045f]', '\u0400', re.I|re.A))
1229+
self.assertIsNone(re.match(r'[\u0400-\u042f]', '\u0450', re.I|re.A))
1230+
self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I|re.A))
1231+
self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I|re.A))
1232+
self.assertIsNone(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I|re.A))
1233+
self.assertIsNone(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I|re.A))
1234+
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I|re.A))
1235+
1236+
self.assertTrue(re.match(r'[N-\x7f]', 'A', re.I|re.A))
1237+
self.assertTrue(re.match(r'[n-\x7f]', 'Z', re.I|re.A))
1238+
self.assertTrue(re.match(r'[N-\uffff]', 'A', re.I|re.A))
1239+
self.assertTrue(re.match(r'[n-\uffff]', 'Z', re.I|re.A))
1240+
self.assertTrue(re.match(r'[N-\U00010000]', 'A', re.I|re.A))
1241+
self.assertTrue(re.match(r'[n-\U00010000]', 'Z', re.I|re.A))
1242+
11881243
# Two different characters have the same lowercase.
11891244
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
11901245
self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix bugs in compiling case-insensitive :mod:`regular expressions <re>` with
2+
character classes containing non-BMP characters: upper-case non-BMP
3+
character was ignored and the ASCII flag was ignored when
4+
matching a character range whose upper bound is beyond the BMP region.

0 commit comments

Comments
 (0)