Skip to content

Commit ab29d95

Browse files
gnprice authored and lisroach committed
bpo-37760: Factor out standard range-expanding logic in makeunicodedata. (pythonGH-15248)
Much like the lower-level logic in commit ef2af1a, we had 4 copies of this logic, written in a couple of different ways. They're all implementing the same standard, so write it just once.
1 parent 786dbba commit ab29d95

File tree

1 file changed

+35
-33
lines changed

1 file changed

+35
-33
lines changed

Tools/unicode/makeunicodedata.py

Lines changed: 35 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@
3232

3333
from functools import partial
3434
from textwrap import dedent
35-
from typing import *
35+
from typing import Iterator, List, Tuple
3636

3737
SCRIPT = sys.argv[0]
3838
VERSION = "3.3"
@@ -904,6 +904,19 @@ def open_data(template, version):
904904
return open(local, 'rb')
905905

906906

907+
def expand_range(char_range: str) -> Iterator[int]:
    '''
    Parses ranges of code points, as described in UAX #44:
      https://www.unicode.org/reports/tr44/#Code_Point_Ranges

    char_range is either a single hexadecimal code point ("0041") or an
    inclusive range written "first..last" ("0041..005A").  Yields every
    code point of the range as an int, in ascending order; a range whose
    last value is below its first yields nothing.

    Raises ValueError if a field is not valid hexadecimal.  Because this
    is a generator, the error surfaces on first iteration rather than at
    call time.
    '''
    # partition() returns an empty separator when no '..' is present,
    # which distinguishes the single-code-point form from a range.
    first_hex, sep, last_hex = char_range.partition('..')
    first = int(first_hex, 16)
    last = int(last_hex, 16) if sep else first
    yield from range(first, last + 1)
918+
919+
907920
class UcdFile:
908921
'''
909922
A file in the standard format of the UCD.
@@ -929,6 +942,12 @@ def records(self) -> Iterator[List[str]]:
929942
def __iter__(self) -> Iterator[List[str]]:
930943
return self.records()
931944

945+
def expanded(self) -> Iterator[Tuple[int, List[str]]]:
    '''
    Iterate over the records with the leading range field expanded.

    The first field of each record is passed to expand_range(); for
    every code point it produces, yields a pair of that code point and
    the record's remaining fields.  The same list of remaining fields
    is yielded for every code point that comes from one record.
    '''
    for fields in self.records():
        remaining = fields[1:]
        for codepoint in expand_range(fields[0]):
            yield codepoint, remaining
950+
932951

933952
# --------------------------------------------------------------------
934953
# the following support code is taken from the unidb utilities
@@ -955,6 +974,9 @@ def __init__(self, version, cjk_check=True):
955974
# expand first-last ranges
956975
field = None
957976
for i in range(0, 0x110000):
977+
# The file UnicodeData.txt has its own distinct way of
978+
# expressing ranges. See:
979+
# https://www.unicode.org/reports/tr44/#Code_Point_Ranges
958980
s = table[i]
959981
if s:
960982
if s[1][-6:] == "First>":
@@ -1019,14 +1041,8 @@ def __init__(self, version, cjk_check=True):
10191041
self.exclusions[char] = 1
10201042

10211043
widths = [None] * 0x110000
1022-
for s in UcdFile(EASTASIAN_WIDTH, version):
1023-
if '..' in s[0]:
1024-
first, last = [int(c, 16) for c in s[0].split('..')]
1025-
chars = list(range(first, last+1))
1026-
else:
1027-
chars = [int(s[0], 16)]
1028-
for char in chars:
1029-
widths[char] = s[1]
1044+
for char, (width,) in UcdFile(EASTASIAN_WIDTH, version).expanded():
1045+
widths[char] = width
10301046

10311047
for i in range(0, 0x110000):
10321048
if table[i] is not None:
@@ -1036,26 +1052,16 @@ def __init__(self, version, cjk_check=True):
10361052
if table[i] is not None:
10371053
table[i].append(set())
10381054

1039-
for r, p in UcdFile(DERIVED_CORE_PROPERTIES, version):
1040-
if ".." in r:
1041-
first, last = [int(c, 16) for c in r.split('..')]
1042-
chars = list(range(first, last+1))
1043-
else:
1044-
chars = [int(r, 16)]
1045-
for char in chars:
1046-
if table[char]:
1047-
# Some properties (e.g. Default_Ignorable_Code_Point)
1048-
# apply to unassigned code points; ignore them
1049-
table[char][-1].add(p)
1050-
1051-
for s in UcdFile(LINE_BREAK, version):
1052-
if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1055+
for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
1056+
if table[char]:
1057+
# Some properties (e.g. Default_Ignorable_Code_Point)
1058+
# apply to unassigned code points; ignore them
1059+
table[char][-1].add(p)
1060+
1061+
for char_range, value in UcdFile(LINE_BREAK, version):
1062+
if value not in MANDATORY_LINE_BREAKS:
10531063
continue
1054-
if '..' not in s[0]:
1055-
first = last = int(s[0], 16)
1056-
else:
1057-
first, last = [int(c, 16) for c in s[0].split('..')]
1058-
for char in range(first, last+1):
1064+
for char in expand_range(char_range):
10591065
table[char][-1].add('Line_Break')
10601066

10611067
# We only want the quickcheck properties
@@ -1073,11 +1079,7 @@ def __init__(self, version, cjk_check=True):
10731079
quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
10741080
quickcheck_shift = qc_order.index(s[1])*2
10751081
quickcheck <<= quickcheck_shift
1076-
if '..' not in s[0]:
1077-
first = last = int(s[0], 16)
1078-
else:
1079-
first, last = [int(c, 16) for c in s[0].split('..')]
1080-
for char in range(first, last+1):
1082+
for char in expand_range(s[0]):
10811083
assert not (quickchecks[char]>>quickcheck_shift)&3
10821084
quickchecks[char] |= quickcheck
10831085
for i in range(0, 0x110000):

0 commit comments

Comments
 (0)