Skip to content

Cleanup unicode table gen #144134

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/tools/unicode-table-generator/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "unicode-table-generator"
version = "0.1.0"
edition = "2021"
edition = "2024"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

Expand Down
14 changes: 6 additions & 8 deletions src/tools/unicode-table-generator/src/cascading_map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ impl RawEmitter {

let points = ranges
.iter()
.flat_map(|r| (r.start..r.end).into_iter().collect::<Vec<u32>>())
.flat_map(|r| (r.start..r.end).collect::<Vec<u32>>())
.collect::<Vec<u32>>();

println!("there are {} points", points.len());
Expand All @@ -32,30 +32,28 @@ impl RawEmitter {
// assert that there is no whitespace over the 0x3000 range.
assert!(point <= 0x3000, "the highest unicode whitespace value has changed");
let high_bytes = point as usize >> 8;
let codepoints = codepoints_by_high_bytes.entry(high_bytes).or_insert_with(Vec::new);
let codepoints = codepoints_by_high_bytes.entry(high_bytes).or_default();
codepoints.push(point);
}

let mut bit_for_high_byte = 1u8;
let mut arms = Vec::<String>::new();

let mut high_bytes: Vec<usize> =
codepoints_by_high_bytes.keys().map(|k| k.clone()).collect();
let mut high_bytes: Vec<usize> = codepoints_by_high_bytes.keys().copied().collect();
high_bytes.sort();
for high_byte in high_bytes {
let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap();
if codepoints.len() == 1 {
let ch = codepoints.pop().unwrap();
arms.push(format!("{} => c as u32 == {:#04x}", high_byte, ch));
arms.push(format!("{high_byte} => c as u32 == {ch:#04x}"));
continue;
}
// more than 1 codepoint in this arm
for codepoint in codepoints {
map[(*codepoint & 0xff) as usize] |= bit_for_high_byte;
}
arms.push(format!(
"{} => WHITESPACE_MAP[c as usize & 0xff] & {} != 0",
high_byte, bit_for_high_byte
"{high_byte} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0"
));
bit_for_high_byte <<= 1;
}
Expand All @@ -68,7 +66,7 @@ impl RawEmitter {
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap();
for arm in arms {
writeln!(&mut self.file, " {},", arm).unwrap();
writeln!(&mut self.file, " {arm},").unwrap();
}
writeln!(&mut self.file, " _ => false,").unwrap();
writeln!(&mut self.file, " }}").unwrap();
Expand Down
2 changes: 1 addition & 1 deletion src/tools/unicode-table-generator/src/case_mapping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ const INDEX_MASK: u32 = 1 << 22;
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String {
let mut file = String::new();

write!(file, "const INDEX_MASK: u32 = 0x{:x};", INDEX_MASK).unwrap();
write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap();
file.push_str("\n\n");
file.push_str(HEADER.trim_start());
file.push('\n');
Expand Down
40 changes: 20 additions & 20 deletions src/tools/unicode-table-generator/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,15 +160,15 @@ fn load_data() -> UnicodeData {
.push(Codepoints::Single(row.codepoint));
}

if let Some(mapped) = row.simple_lowercase_mapping {
if mapped != row.codepoint {
to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0));
}
if let Some(mapped) = row.simple_lowercase_mapping
&& mapped != row.codepoint
{
to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0));
}
if let Some(mapped) = row.simple_uppercase_mapping {
if mapped != row.codepoint {
to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0));
}
if let Some(mapped) = row.simple_uppercase_mapping
&& mapped != row.codepoint
{
to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0));
}
}

Expand Down Expand Up @@ -196,12 +196,12 @@ fn load_data() -> UnicodeData {
.flat_map(|codepoints| match codepoints {
Codepoints::Single(c) => c
.scalar()
.map(|ch| (ch as u32..ch as u32 + 1))
.map(|ch| ch as u32..ch as u32 + 1)
.into_iter()
.collect::<Vec<_>>(),
Codepoints::Range(c) => c
.into_iter()
.flat_map(|c| c.scalar().map(|ch| (ch as u32..ch as u32 + 1)))
.flat_map(|c| c.scalar().map(|ch| ch as u32..ch as u32 + 1))
.collect::<Vec<_>>(),
})
.collect::<Vec<Range<u32>>>(),
Expand Down Expand Up @@ -236,7 +236,7 @@ fn main() {
let ranges_by_property = &unicode_data.ranges;

if let Some(path) = test_path {
std::fs::write(&path, generate_tests(&write_location, &ranges_by_property)).unwrap();
std::fs::write(&path, generate_tests(&write_location, ranges_by_property)).unwrap();
}

let mut total_bytes = 0;
Expand All @@ -246,9 +246,9 @@ fn main() {

let mut emitter = RawEmitter::new();
if property == &"White_Space" {
emit_whitespace(&mut emitter, &ranges);
emit_whitespace(&mut emitter, ranges);
} else {
emit_codepoints(&mut emitter, &ranges);
emit_codepoints(&mut emitter, ranges);
}

modules.push((property.to_lowercase().to_string(), emitter.file));
Expand Down Expand Up @@ -288,7 +288,7 @@ fn main() {
for line in contents.lines() {
if !line.trim().is_empty() {
table_file.push_str(" ");
table_file.push_str(&line);
table_file.push_str(line);
}
table_file.push('\n');
}
Expand All @@ -312,15 +312,15 @@ fn version() -> String {
let start = readme.find(prefix).unwrap() + prefix.len();
let end = readme.find(" of the Unicode Standard.").unwrap();
let version =
readme[start..end].split('.').map(|v| v.parse::<u32>().expect(&v)).collect::<Vec<_>>();
readme[start..end].split('.').map(|v| v.parse::<u32>().expect(v)).collect::<Vec<_>>();
let [major, minor, micro] = [version[0], version[1], version[2]];

out.push_str(&format!("({major}, {minor}, {micro});\n"));
out
}

fn fmt_list<V: std::fmt::Debug>(values: impl IntoIterator<Item = V>) -> String {
let pieces = values.into_iter().map(|b| format!("{:?}, ", b)).collect::<Vec<_>>();
let pieces = values.into_iter().map(|b| format!("{b:?}, ")).collect::<Vec<_>>();
let mut out = String::new();
let mut line = String::from("\n ");
for piece in pieces {
Expand Down Expand Up @@ -348,7 +348,7 @@ fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String
s.push_str("\nfn main() {\n");

for (property, ranges) in ranges {
s.push_str(&format!(r#" println!("Testing {}");"#, property));
s.push_str(&format!(r#" println!("Testing {property}");"#));
s.push('\n');
s.push_str(&format!(" {}_true();\n", property.to_lowercase()));
s.push_str(&format!(" {}_false();\n", property.to_lowercase()));
Expand All @@ -373,7 +373,7 @@ fn generate_tests(data_path: &str, ranges: &[(&str, Vec<Range<u32>>)]) -> String
s.push_str(" }\n\n");
}

s.push_str("}");
s.push('}');
s
}

Expand All @@ -388,7 +388,7 @@ fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool
range.start,
));
} else {
s.push_str(&format!(" for chn in {:?}u32 {{\n", range));
s.push_str(&format!(" for chn in {range:?}u32 {{\n"));
s.push_str(&format!(
" assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n",
if truthy { "" } else { "!" },
Expand Down Expand Up @@ -439,7 +439,7 @@ fn merge_ranges(ranges: &mut Vec<Range<u32>>) {
let mut last_end = None;
for range in ranges {
if let Some(last) = last_end {
assert!(range.start > last, "{:?}", range);
assert!(range.start > last, "{range:?}");
}
last_end = Some(range.end);
}
Expand Down
25 changes: 10 additions & 15 deletions src/tools/unicode-table-generator/src/raw_emitter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,10 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
emitter.blank_line();

let mut bitset = emitter.clone();
let bitset_ok = bitset.emit_bitset(&ranges).is_ok();
let bitset_ok = bitset.emit_bitset(ranges).is_ok();

let mut skiplist = emitter.clone();
skiplist.emit_skiplist(&ranges);
skiplist.emit_skiplist(ranges);

if bitset_ok && bitset.bytes_used <= skiplist.bytes_used {
*emitter = bitset;
Expand All @@ -174,7 +174,7 @@ pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range<u32>]) {
emitter.blank_line();

let mut cascading = emitter.clone();
cascading.emit_cascading_map(&ranges);
cascading.emit_cascading_map(ranges);
*emitter = cascading;
emitter.desc = String::from("cascading");
}
Expand Down Expand Up @@ -272,7 +272,7 @@ impl Canonicalized {
// for canonical when possible.
while let Some((&to, _)) = mappings
.iter()
.find(|(&to, _)| to == 0)
.find(|&(&to, _)| to == 0)
.or_else(|| mappings.iter().max_by_key(|m| m.1.len()))
{
// Get the mapping with the most entries. Currently, no mapping can
Expand Down Expand Up @@ -311,10 +311,9 @@ impl Canonicalized {
}
}
}
assert!(
unique_mapping
.insert(to, UniqueMapping::Canonical(canonical_words.len()))
.is_none()
assert_eq!(
unique_mapping.insert(to, UniqueMapping::Canonical(canonical_words.len())),
None
);
canonical_words.push(to);

Expand All @@ -340,14 +339,10 @@ impl Canonicalized {
// We'll probably always have some slack though so this loop will still
// be needed.
for &w in unique_words {
if !unique_mapping.contains_key(&w) {
assert!(
unique_mapping
.insert(w, UniqueMapping::Canonical(canonical_words.len()))
.is_none()
);
unique_mapping.entry(w).or_insert_with(|| {
canonical_words.push(w);
}
UniqueMapping::Canonical(canonical_words.len())
});
}
assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len());
assert_eq!(unique_mapping.len(), unique_words.len());
Expand Down
Loading