Skip to content

Commit 7d091d5

Browse files
[3.12] gh-126505: Fix bugs in compiling case-insensitive character classes (GH-126557) (GH-126690)
* upper-case non-BMP character was ignored * the ASCII flag was ignored when matching a character range whose upper bound is beyond the BMP region (cherry picked from commit 819830f) Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent aee80cd commit 7d091d5

File tree

3 files changed

+73
-9
lines changed

3 files changed

+73
-9
lines changed

Lib/re/_compiler.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -250,19 +250,19 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
250250
while True:
251251
try:
252252
if op is LITERAL:
253-
if fixup:
254-
lo = fixup(av)
255-
charmap[lo] = 1
256-
if fixes and lo in fixes:
257-
for k in fixes[lo]:
253+
if fixup: # IGNORECASE and not LOCALE
254+
av = fixup(av)
255+
charmap[av] = 1
256+
if fixes and av in fixes:
257+
for k in fixes[av]:
258258
charmap[k] = 1
259259
if not hascased and iscased(av):
260260
hascased = True
261261
else:
262262
charmap[av] = 1
263263
elif op is RANGE:
264264
r = range(av[0], av[1]+1)
265-
if fixup:
265+
if fixup: # IGNORECASE and not LOCALE
266266
if fixes:
267267
for i in map(fixup, r):
268268
charmap[i] = 1
@@ -289,8 +289,7 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
289289
# Character set contains non-BMP character codes.
290290
# For range, all BMP characters in the range are already
291291
# processed.
292-
if fixup:
293-
hascased = True
292+
if fixup: # IGNORECASE and not LOCALE
294293
# For now, IN_UNI_IGNORE+LITERAL and
295294
# IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP
296295
# characters, because two characters (at least one of
@@ -301,7 +300,13 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
301300
# Also, both c.lower() and c.lower().upper() are single
302301
# characters for every non-BMP character.
303302
if op is RANGE:
304-
op = RANGE_UNI_IGNORE
303+
if fixes: # not ASCII
304+
op = RANGE_UNI_IGNORE
305+
hascased = True
306+
else:
307+
assert op is LITERAL
308+
if not hascased and iscased(av):
309+
hascased = True
305310
tail.append((op, av))
306311
break
307312

Lib/test/test_re.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1073,6 +1073,39 @@ def test_ignore_case_set(self):
10731073
self.assertTrue(re.match(br'[19a]', b'a', re.I))
10741074
self.assertTrue(re.match(br'[19a]', b'A', re.I))
10751075
self.assertTrue(re.match(br'[19A]', b'a', re.I))
1076+
self.assertTrue(re.match(r'[19\xc7]', '\xc7', re.I))
1077+
self.assertTrue(re.match(r'[19\xc7]', '\xe7', re.I))
1078+
self.assertTrue(re.match(r'[19\xe7]', '\xc7', re.I))
1079+
self.assertTrue(re.match(r'[19\xe7]', '\xe7', re.I))
1080+
self.assertTrue(re.match(r'[19\u0400]', '\u0400', re.I))
1081+
self.assertTrue(re.match(r'[19\u0400]', '\u0450', re.I))
1082+
self.assertTrue(re.match(r'[19\u0450]', '\u0400', re.I))
1083+
self.assertTrue(re.match(r'[19\u0450]', '\u0450', re.I))
1084+
self.assertTrue(re.match(r'[19\U00010400]', '\U00010400', re.I))
1085+
self.assertTrue(re.match(r'[19\U00010400]', '\U00010428', re.I))
1086+
self.assertTrue(re.match(r'[19\U00010428]', '\U00010400', re.I))
1087+
self.assertTrue(re.match(r'[19\U00010428]', '\U00010428', re.I))
1088+
1089+
self.assertTrue(re.match(br'[19A]', b'A', re.I))
1090+
self.assertTrue(re.match(br'[19a]', b'a', re.I))
1091+
self.assertTrue(re.match(br'[19a]', b'A', re.I))
1092+
self.assertTrue(re.match(br'[19A]', b'a', re.I))
1093+
self.assertTrue(re.match(r'[19A]', 'A', re.I|re.A))
1094+
self.assertTrue(re.match(r'[19a]', 'a', re.I|re.A))
1095+
self.assertTrue(re.match(r'[19a]', 'A', re.I|re.A))
1096+
self.assertTrue(re.match(r'[19A]', 'a', re.I|re.A))
1097+
self.assertTrue(re.match(r'[19\xc7]', '\xc7', re.I|re.A))
1098+
self.assertIsNone(re.match(r'[19\xc7]', '\xe7', re.I|re.A))
1099+
self.assertIsNone(re.match(r'[19\xe7]', '\xc7', re.I|re.A))
1100+
self.assertTrue(re.match(r'[19\xe7]', '\xe7', re.I|re.A))
1101+
self.assertTrue(re.match(r'[19\u0400]', '\u0400', re.I|re.A))
1102+
self.assertIsNone(re.match(r'[19\u0400]', '\u0450', re.I|re.A))
1103+
self.assertIsNone(re.match(r'[19\u0450]', '\u0400', re.I|re.A))
1104+
self.assertTrue(re.match(r'[19\u0450]', '\u0450', re.I|re.A))
1105+
self.assertTrue(re.match(r'[19\U00010400]', '\U00010400', re.I|re.A))
1106+
self.assertIsNone(re.match(r'[19\U00010400]', '\U00010428', re.I|re.A))
1107+
self.assertIsNone(re.match(r'[19\U00010428]', '\U00010400', re.I|re.A))
1108+
self.assertTrue(re.match(r'[19\U00010428]', '\U00010428', re.I|re.A))
10761109

10771110
# Two different characters have the same lowercase.
10781111
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
@@ -1109,8 +1142,10 @@ def test_ignore_case_range(self):
11091142
self.assertTrue(re.match(br'[9-a]', b'_', re.I))
11101143
self.assertIsNone(re.match(br'[9-A]', b'_', re.I))
11111144
self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
1145+
self.assertTrue(re.match(r'[\xc0-\xde]', '\xe7', re.I))
11121146
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
11131147
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I))
1148+
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xc7', re.I))
11141149
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
11151150
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I))
11161151
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0400', re.I))
@@ -1121,6 +1156,26 @@ def test_ignore_case_range(self):
11211156
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I))
11221157
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I))
11231158

1159+
self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I|re.A))
1160+
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xe7', re.I|re.A))
1161+
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7', re.I|re.A))
1162+
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xc7', re.I|re.A))
1163+
self.assertTrue(re.match(r'[\u0430-\u045f]', '\u0450', re.I|re.A))
1164+
self.assertIsNone(re.match(r'[\u0430-\u045f]', '\u0400', re.I|re.A))
1165+
self.assertIsNone(re.match(r'[\u0400-\u042f]', '\u0450', re.I|re.A))
1166+
self.assertTrue(re.match(r'[\u0400-\u042f]', '\u0400', re.I|re.A))
1167+
self.assertTrue(re.match(r'[\U00010428-\U0001044f]', '\U00010428', re.I|re.A))
1168+
self.assertIsNone(re.match(r'[\U00010428-\U0001044f]', '\U00010400', re.I|re.A))
1169+
self.assertIsNone(re.match(r'[\U00010400-\U00010427]', '\U00010428', re.I|re.A))
1170+
self.assertTrue(re.match(r'[\U00010400-\U00010427]', '\U00010400', re.I|re.A))
1171+
1172+
self.assertTrue(re.match(r'[N-\x7f]', 'A', re.I|re.A))
1173+
self.assertTrue(re.match(r'[n-\x7f]', 'Z', re.I|re.A))
1174+
self.assertTrue(re.match(r'[N-\uffff]', 'A', re.I|re.A))
1175+
self.assertTrue(re.match(r'[n-\uffff]', 'Z', re.I|re.A))
1176+
self.assertTrue(re.match(r'[N-\U00010000]', 'A', re.I|re.A))
1177+
self.assertTrue(re.match(r'[n-\U00010000]', 'Z', re.I|re.A))
1178+
11241179
# Two different characters have the same lowercase.
11251180
assert 'K'.lower() == '\u212a'.lower() == 'k' # 'K'
11261181
self.assertTrue(re.match(r'[J-M]', '\u212a', re.I))
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix bugs in compiling case-insensitive :mod:`regular expressions <re>` with
2+
character classes containing non-BMP characters: upper-case non-BMP
3+
character was ignored and the ASCII flag was ignored when
4+
matching a character range whose upper bound is beyond the BMP region.

0 commit comments

Comments
 (0)