32
32
33
33
from functools import partial
34
34
from textwrap import dedent
35
- from typing import *
35
+ from typing import Iterator , List , Tuple
36
36
37
37
SCRIPT = sys .argv [0 ]
38
38
VERSION = "3.3"
@@ -904,6 +904,19 @@ def open_data(template, version):
904
904
return open (local , 'rb' )
905
905
906
906
907
def expand_range(char_range: str) -> Iterator[int]:
    '''
    Parses ranges of code points, as described in UAX #44:
      https://www.unicode.org/reports/tr44/#Code_Point_Ranges

    char_range is either a single hexadecimal code point ("0041") or
    an inclusive range written "first..last" ("0041..005A").  Yields
    each code point in the range as an int, in ascending order.

    Because this is a generator, a ValueError for a malformed bound
    (or for more than one ".." separator) is raised on first
    iteration rather than at call time.
    '''
    if '..' in char_range:
        first, last = [int(c, 16) for c in char_range.split('..')]
    else:
        first = last = int(char_range, 16)
    # Both bounds are inclusive, hence the +1.
    yield from range(first, last + 1)
918
+
919
+
907
920
class UcdFile :
908
921
'''
909
922
A file in the standard format of the UCD.
@@ -929,6 +942,12 @@ def records(self) -> Iterator[List[str]]:
929
942
def __iter__(self) -> Iterator[List[str]]:
    '''
    Iterates over the records of the file.

    Simply delegates to self.records(), so iterating over the object
    and calling records() directly are equivalent.
    '''
    return self.records()
931
944
945
def expanded(self) -> Iterator[Tuple[int, List[str]]]:
    '''
    Yields one (code point, remaining fields) pair per code point.

    The first field of each record from self.records() is treated as
    a code point or code point range and expanded with expand_range();
    the other fields are passed through unchanged.  Note that the
    same list object of remaining fields is yielded for every code
    point of a given record.
    '''
    for record in self.records():
        char_range, rest = record[0], record[1:]
        for char in expand_range(char_range):
            yield char, rest
950
+
932
951
933
952
# --------------------------------------------------------------------
934
953
# the following support code is taken from the unidb utilities
@@ -955,6 +974,9 @@ def __init__(self, version, cjk_check=True):
955
974
# expand first-last ranges
956
975
field = None
957
976
for i in range (0 , 0x110000 ):
977
+ # The file UnicodeData.txt has its own distinct way of
978
+ # expressing ranges. See:
979
+ # https://www.unicode.org/reports/tr44/#Code_Point_Ranges
958
980
s = table [i ]
959
981
if s :
960
982
if s [1 ][- 6 :] == "First>" :
@@ -1019,14 +1041,8 @@ def __init__(self, version, cjk_check=True):
1019
1041
self .exclusions [char ] = 1
1020
1042
1021
1043
widths = [None ] * 0x110000
1022
- for s in UcdFile (EASTASIAN_WIDTH , version ):
1023
- if '..' in s [0 ]:
1024
- first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1025
- chars = list (range (first , last + 1 ))
1026
- else :
1027
- chars = [int (s [0 ], 16 )]
1028
- for char in chars :
1029
- widths [char ] = s [1 ]
1044
+ for char , (width ,) in UcdFile (EASTASIAN_WIDTH , version ).expanded ():
1045
+ widths [char ] = width
1030
1046
1031
1047
for i in range (0 , 0x110000 ):
1032
1048
if table [i ] is not None :
@@ -1036,26 +1052,16 @@ def __init__(self, version, cjk_check=True):
1036
1052
if table [i ] is not None :
1037
1053
table [i ].append (set ())
1038
1054
1039
- for r , p in UcdFile (DERIVED_CORE_PROPERTIES , version ):
1040
- if ".." in r :
1041
- first , last = [int (c , 16 ) for c in r .split ('..' )]
1042
- chars = list (range (first , last + 1 ))
1043
- else :
1044
- chars = [int (r , 16 )]
1045
- for char in chars :
1046
- if table [char ]:
1047
- # Some properties (e.g. Default_Ignorable_Code_Point)
1048
- # apply to unassigned code points; ignore them
1049
- table [char ][- 1 ].add (p )
1050
-
1051
- for s in UcdFile (LINE_BREAK , version ):
1052
- if len (s ) < 2 or s [1 ] not in MANDATORY_LINE_BREAKS :
1055
+ for char , (p ,) in UcdFile (DERIVED_CORE_PROPERTIES , version ).expanded ():
1056
+ if table [char ]:
1057
+ # Some properties (e.g. Default_Ignorable_Code_Point)
1058
+ # apply to unassigned code points; ignore them
1059
+ table [char ][- 1 ].add (p )
1060
+
1061
+ for char_range , value in UcdFile (LINE_BREAK , version ):
1062
+ if value not in MANDATORY_LINE_BREAKS :
1053
1063
continue
1054
- if '..' not in s [0 ]:
1055
- first = last = int (s [0 ], 16 )
1056
- else :
1057
- first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1058
- for char in range (first , last + 1 ):
1064
+ for char in expand_range (char_range ):
1059
1065
table [char ][- 1 ].add ('Line_Break' )
1060
1066
1061
1067
# We only want the quickcheck properties
@@ -1073,11 +1079,7 @@ def __init__(self, version, cjk_check=True):
1073
1079
quickcheck = 'MN' .index (s [2 ]) + 1 # Maybe or No
1074
1080
quickcheck_shift = qc_order .index (s [1 ])* 2
1075
1081
quickcheck <<= quickcheck_shift
1076
- if '..' not in s [0 ]:
1077
- first = last = int (s [0 ], 16 )
1078
- else :
1079
- first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1080
- for char in range (first , last + 1 ):
1082
+ for char in expand_range (s [0 ]):
1081
1083
assert not (quickchecks [char ]>> quickcheck_shift )& 3
1082
1084
quickchecks [char ] |= quickcheck
1083
1085
for i in range (0 , 0x110000 ):
0 commit comments