From 6fa8898b537fa54838ad46a406e2acd761ff06ad Mon Sep 17 00:00:00 2001 From: Arlie Davis Date: Fri, 18 Dec 2020 11:37:56 -0800 Subject: [PATCH] Split Symbol interner into static and dynamic This improves performance of Symbol interning in several ways. The main motivation of this work is to prepare rustc for efficient parallel builds. The Symbol lookup table (mapping from strings to Symbol numbers) is now split into two separate tables: a static table and a dynamic table. The static table contains strings that are known to rustc, including all keywords and all `sym::foo` symbols. The dynamic table contains strings that rustc discovers while compiling, such as "my_super_obscure_function_name". Since the static table is known at compile time (that is, when rustc itself is being compiled), this table can be stored entirely in static data structures. We use the `phf` crate to generate this table; `phf` generates perfect hash functions. This allows rustc to perform Symbol lookups for static symbols without any multithreaded synchronization, or accessing any dynamic data whatsoever. I measured the percentage of static symbol lookups in many common Rust crates, including rust/compiler, rust/library, servo, rand, quote, syn, rust-analyzer, rayon, and rsvg. Among these crates, between 35% and 55% of all symbol lookups were resolved using the static lookup table. --- Cargo.lock | 2 + compiler/rustc_macros/Cargo.toml | 1 + compiler/rustc_macros/src/symbols.rs | 92 +++++++++++++++---------- compiler/rustc_span/Cargo.toml | 1 + compiler/rustc_span/src/lib.rs | 2 +- compiler/rustc_span/src/symbol.rs | 87 +++++++++++++++++++---- compiler/rustc_span/src/symbol/tests.rs | 78 +++++++++++++++++++-- src/tools/tidy/src/deps.rs | 5 ++ 8 files changed, 212 insertions(+), 56 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c2c06a2adb14e..f52ae7340aad7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3913,6 +3913,7 @@ dependencies = [ name = "rustc_macros" version = "0.1.0" dependencies = [ + "phf_codegen", "proc-macro2", "quote", "syn", @@ -4200,6 +4201,7 @@ version = "0.0.0" dependencies = [ "cfg-if 0.1.10", "md-5", + "phf", "rustc_arena", "rustc_data_structures", "rustc_index", diff --git a/compiler/rustc_macros/Cargo.toml b/compiler/rustc_macros/Cargo.toml index 73eb0dd56d772..b123af8c172e2 100644 --- a/compiler/rustc_macros/Cargo.toml +++ b/compiler/rustc_macros/Cargo.toml @@ -12,3 +12,4 @@ synstructure = "0.12.1" syn = { version = "1", features = ["full"] } proc-macro2 = "1" quote = "1" +phf_codegen = "0.8" diff --git a/compiler/rustc_macros/src/symbols.rs b/compiler/rustc_macros/src/symbols.rs index 5b932864dff5d..4a12a3312b99e 100644 --- a/compiler/rustc_macros/src/symbols.rs +++ b/compiler/rustc_macros/src/symbols.rs @@ -124,10 +124,6 @@ fn symbols_with_errors(input: TokenStream) -> (TokenStream, Vec) { } }; - let mut keyword_stream = quote! {}; - let mut symbols_stream = quote! {}; - let mut prefill_stream = quote! {}; - let mut counter = 0u32; let mut keys = HashMap::::with_capacity(input.keywords.len() + input.symbols.len() + 10); let mut prev_key: Option<(Span, String)> = None; @@ -136,8 +132,10 @@ fn symbols_with_errors(input: TokenStream) -> (TokenStream, Vec) { if let Some(prev_span) = keys.get(str) { errors.error(span, format!("Symbol `{}` is duplicated", str)); errors.error(*prev_span, format!("location of previous definition")); + Err(()) } else { keys.insert(str.to_string(), span); + Ok(()) } }; @@ -151,51 +149,75 @@ fn symbols_with_errors(input: TokenStream) -> (TokenStream, Vec) { prev_key = Some((span, str.to_string())); }; + let mut symbol_strings: Vec = Vec::new(); + // Generate the listed keywords. + let mut keyword_stream = quote! {}; for keyword in input.keywords.iter() { let name = &keyword.name; let value = &keyword.value; let value_string = value.value(); - check_dup(keyword.name.span(), &value_string, &mut errors); - prefill_stream.extend(quote! { - #value, - }); + let symbol_index = symbol_strings.len() as u32; + if check_dup(keyword.name.span(), &value_string, &mut errors).is_ok() { + // Only add an entry to `symbol_strings` if it is not a duplicate. + // If it is a duplicate, then compilation will fail. However, we still + // want to avoid panicking, if a duplicate is detected. + symbol_strings.push(value_string); + } keyword_stream.extend(quote! { - pub const #name: Symbol = Symbol::new(#counter); + pub const #name: Symbol = Symbol::new(#symbol_index); }); - counter += 1; + } + + // Generate symbols for the strings "0", "1", ..., "9". + let digits_base = symbol_strings.len() as u32; + for n in 0..10 { + let n_string = n.to_string(); + if check_dup(Span::call_site(), &n_string, &mut errors).is_ok() { + symbol_strings.push(n_string); + } } // Generate the listed symbols. + let mut symbols_stream = quote! {}; for symbol in input.symbols.iter() { let name = &symbol.name; + let name_string = symbol.name.to_string(); + check_order(symbol.name.span(), &name_string, &mut errors); let value = match &symbol.value { Some(value) => value.value(), - None => name.to_string(), + None => name_string, }; - check_dup(symbol.name.span(), &value, &mut errors); - check_order(symbol.name.span(), &name.to_string(), &mut errors); - prefill_stream.extend(quote! { - #value, - }); + let symbol_index = symbol_strings.len() as u32; + if check_dup(symbol.name.span(), &value, &mut errors).is_ok() { + // Only add an entry to `symbol_strings` if it is not a duplicate. + // If it is a duplicate, then compilation will fail. However, we still + // want to avoid panicking, if a duplicate is detected. + symbol_strings.push(value); + } + symbols_stream.extend(quote! { - pub const #name: Symbol = Symbol::new(#counter); + pub const #name: Symbol = Symbol::new(#symbol_index); }); - counter += 1; } - // Generate symbols for the strings "0", "1", ..., "9". - let digits_base = counter; - counter += 10; - for n in 0..10 { - let n = n.to_string(); - check_dup(Span::call_site(), &n, &mut errors); - prefill_stream.extend(quote! { - #n, - }); + // We have finished collecting symbol strings. + let static_symbols_len = symbol_strings.len(); + let dynamic_symbol_base = symbol_strings.len() as u32; + let symbol_strings = symbol_strings; // no more mutation + + // Build the body of STATIC_SYMBOLS. + let symbol_strings_tokens: TokenStream = symbol_strings.iter().map(|s| quote!(#s,)).collect(); + + // Build the PHF map. This translates from strings to Symbol values. + let mut phf_map = phf_codegen::Map::<&str>::new(); + for (symbol_index, symbol) in symbol_strings.iter().enumerate() { + phf_map.entry(symbol, format!("Symbol::new({})", symbol_index as u32).as_str()); } - let _ = counter; // for future use + let phf_map_built = phf_map.build(); + let phf_map_text = phf_map_built.to_string(); + let phf_map_expr = syn::parse_str::(&phf_map_text).unwrap(); let output = quote! { const SYMBOL_DIGITS_BASE: u32 = #digits_base; @@ -215,13 +237,13 @@ fn symbols_with_errors(input: TokenStream) -> (TokenStream, Vec) { #symbols_stream } - impl Interner { - pub fn fresh() -> Self { - Interner::prefill(&[ - #prefill_stream - ]) - } - } + const DYNAMIC_SYMBOL_BASE: u32 = #dynamic_symbol_base; + + static STATIC_SYMBOLS: [&str; #static_symbols_len as usize] = [ + #symbol_strings_tokens + ]; + + static STATIC_SYMBOLS_PHF: ::phf::Map<&'static str, Symbol> = #phf_map_expr; }; (output, errors.list) diff --git a/compiler/rustc_span/Cargo.toml b/compiler/rustc_span/Cargo.toml index 08645990c4870..90b3c10f804dc 100644 --- a/compiler/rustc_span/Cargo.toml +++ b/compiler/rustc_span/Cargo.toml @@ -20,3 +20,4 @@ tracing = "0.1" sha-1 = "0.9" sha2 = "0.9" md-5 = "0.9" +phf = "0.8" diff --git a/compiler/rustc_span/src/lib.rs b/compiler/rustc_span/src/lib.rs index 50cb15554864f..143d73ab61249 100644 --- a/compiler/rustc_span/src/lib.rs +++ b/compiler/rustc_span/src/lib.rs @@ -90,7 +90,7 @@ pub struct SessionGlobals { impl SessionGlobals { pub fn new(edition: Edition) -> SessionGlobals { SessionGlobals { - symbol_interner: Lock::new(symbol::Interner::fresh()), + symbol_interner: Lock::new(symbol::Interner::default()), span_interner: Lock::new(span_encoding::SpanInterner::default()), hygiene_data: Lock::new(hygiene::HygieneData::new(edition)), source_map: Lock::new(None), diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index 7b90e5b611cd1..8839897fcb3cd 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -1452,17 +1452,55 @@ impl Symbol { Symbol(SymbolIndex::from_u32(n)) } + /// Maps a string to its interned representation, but only if this string is a known + /// (static) symbol. + pub fn intern_static(string: &str) -> Option { + if let Some(symbol) = STATIC_SYMBOLS_PHF.get(string) { Some(*symbol) } else { None } + } + /// Maps a string to its interned representation. + // #[inline(never)] - There is no benefit to inlining this function (verified with + // performance measurements), and a reduction in overall code size by disabling inlining. + #[inline(never)] pub fn intern(string: &str) -> Self { - with_interner(|interner| interner.intern(string)) + if let Some(symbol) = Symbol::intern_static(string) { + symbol + } else { + with_interner(|interner| interner.intern_dynamic(string)) + } + } + + pub fn is_static(self) -> bool { + self.0.as_u32() < DYNAMIC_SYMBOL_BASE + } + + /// Translates the `Symbol` to a string, but only if this `Symbol` + /// was originally interned as a static symbol. + pub fn as_str_static(self) -> Option<&'static str> { + let symbol_index = self.0.as_usize(); + if symbol_index < STATIC_SYMBOLS.len() { + // This is a well-known symbol. The symbol string is stored in a static field. + // There is no need to lock the interner. + Some(STATIC_SYMBOLS[symbol_index]) + } else { + None + } } /// Convert to a `SymbolStr`. This is a slowish operation because it /// requires locking the symbol interner. + /// + /// If the symbol is a statically-interned symbol (interned at rustc compile time), + /// then this operation is fast, and does not acquire any locks. pub fn as_str(self) -> SymbolStr { - with_interner(|interner| unsafe { - SymbolStr { string: std::mem::transmute::<&str, &str>(interner.get(self)) } - }) + if let Some(string) = self.as_str_static() { + SymbolStr { string } + } else { + // This is a dynamic string. The string is stored in the Interner. + with_interner(|interner| unsafe { + SymbolStr { string: std::mem::transmute::<&str, &str>(interner.get_dynamic(self)) } + }) + } } pub fn as_u32(self) -> u32 { @@ -1528,6 +1566,13 @@ impl ToStableHashKey for Symbol { // The `FxHashMap`+`Vec` pair could be replaced by `FxIndexSet`, but #75278 // found that to regress performance up to 2% in some cases. This might be // revisited after further improvements to `indexmap`. +// +// `Interner` does not contain any of the statically-known symbol names. +// It does not contain any of the strings defined in the `Keyword` or +// `Symbol` sections. Since those strings are statically-known, we just +// look them up in a (static) table, when needed. See +// `STATIC_SYMBOLS` and `STATIC_SYMBOLS_PHF`, which are both generated by +// `compiler/rustc_macros/src/symbols.rs`. #[derive(Default)] pub struct Interner { arena: DroplessArena, @@ -1536,21 +1581,28 @@ pub struct Interner { } impl Interner { - fn prefill(init: &[&'static str]) -> Self { - Interner { - strings: init.into(), - names: init.iter().copied().zip((0..).map(Symbol::new)).collect(), - ..Default::default() + pub fn intern(&mut self, string: &str) -> Symbol { + if let Some(sym) = Symbol::intern_static(string) { + sym + } else { + self.intern_dynamic(string) } } - #[inline] - pub fn intern(&mut self, string: &str) -> Symbol { + fn intern_dynamic(&mut self, string: &str) -> Symbol { + // The caller should have already checked for static symbols. + // Failure to do so is a bug, since this code will mistakenly + // intern the static symbol, resulting in a bogus symbol index. + // (The whole point of this design is that you can do static + // lookups without acquiring the thread-local Interner, so if + // we got here with a static symbol, we goofed.) + debug_assert!(Symbol::intern_static(string).is_none()); + if let Some(&name) = self.names.get(string) { return name; } - let name = Symbol::new(self.strings.len() as u32); + let name = Symbol::new(DYNAMIC_SYMBOL_BASE + self.strings.len() as u32); // `from_utf8_unchecked` is safe since we just allocated a `&str` which is known to be // UTF-8. @@ -1567,7 +1619,16 @@ impl Interner { // Get the symbol as a string. `Symbol::as_str()` should be used in // preference to this function. pub fn get(&self, symbol: Symbol) -> &str { - self.strings[symbol.0.as_usize()] + if let Some(string) = symbol.as_str_static() { + string + } else { + &self.strings[(symbol.as_u32() - DYNAMIC_SYMBOL_BASE) as usize] + } + } + + fn get_dynamic(&self, symbol: Symbol) -> &str { + debug_assert!(!symbol.is_static()); + self.strings[(symbol.as_u32() - DYNAMIC_SYMBOL_BASE) as usize] } } diff --git a/compiler/rustc_span/src/symbol/tests.rs b/compiler/rustc_span/src/symbol/tests.rs index 47da03424b770..f4102089ddb9d 100644 --- a/compiler/rustc_span/src/symbol/tests.rs +++ b/compiler/rustc_span/src/symbol/tests.rs @@ -5,15 +5,14 @@ use crate::{edition, SessionGlobals}; #[test] fn interner_tests() { let mut i: Interner = Interner::default(); - // first one is zero: - assert_eq!(i.intern("dog"), Symbol::new(0)); + let dog = i.intern("dog"); // re-use gets the same entry: - assert_eq!(i.intern("dog"), Symbol::new(0)); + assert_eq!(i.intern("dog").as_u32(), dog.as_u32()); // different string gets a different #: - assert_eq!(i.intern("cat"), Symbol::new(1)); - assert_eq!(i.intern("cat"), Symbol::new(1)); - // dog is still at zero - assert_eq!(i.intern("dog"), Symbol::new(0)); + let cat = i.intern("cat"); + assert_ne!(dog.as_u32(), cat.as_u32()); + assert_eq!(i.intern("cat").as_u32(), cat.as_u32()); + assert_eq!(i.intern("dog").as_u32(), dog.as_u32()); } #[test] @@ -23,3 +22,68 @@ fn without_first_quote_test() { assert_eq!(i.without_first_quote().name, kw::Break); }); } + +#[test] +fn test_static_symbols() { + assert_eq!(Symbol::intern_static(""), Some(kw::Invalid)); + assert_eq!(Symbol::intern_static("not in the static table"), None); + assert!(Symbol::intern_static("fn").is_some()); // don't care about exact index + + // check round-tripping + for &string in ["as", "fn", "let", "trait", "size_of_val"].iter() { + let sym = Symbol::intern_static(string).unwrap(); + assert_eq!(string, &*sym.as_str(), "sym #{}", sym.0.as_u32()); + } +} + +#[test] +fn test_ident_is_special() { + for &s in [kw::Invalid, kw::PathRoot, kw::DollarCrate, kw::Underscore].iter() { + let ident = Ident::with_dummy_span(s); + assert_eq!(ident.is_special(), true, "s = {:?}", s); + } + + for &s in [kw::As, kw::Break, kw::UnderscoreLifetime].iter() { + let ident = Ident::with_dummy_span(s); + assert_eq!(ident.is_special(), false, "s = {:?}", s); + } +} + +#[test] +fn test_symbol_as_str() { + SESSION_GLOBALS.set(&SessionGlobals::new(edition::Edition::Edition2018), || { + for &(sym, string) in [ + (kw::Invalid, ""), + (kw::PathRoot, "{{root}}"), + (kw::DollarCrate, "$crate"), + (kw::As, "as"), + (kw::Break, "break"), + (kw::While, "while"), + (kw::Union, "union"), + (sym::Alignment, "Alignment"), + (sym::Arc, "Arc"), + (sym::zmm_reg, "zmm_reg"), + (sym::i64, "i64"), + ] + .iter() + { + let as_str = sym.as_str(); + assert_eq!(&*as_str, string); + + let sym2 = Symbol::intern(string); + assert_eq!(sym, sym2, "sym={} sym2={}", sym.as_u32(), sym2.as_u32()); + } + + let colon = Symbol::intern(":"); + assert_eq!(&*colon.as_str(), ":"); + }); +} + +#[test] +fn test_dynamic_symbols() { + crate::with_session_globals(crate::edition::Edition::Edition2018, || { + let s1 = Symbol::intern("fuzzy wuzzy"); + assert!(!s1.is_static()); + assert_eq!(&*s1.as_str(), "fuzzy wuzzy"); + }); +} diff --git a/src/tools/tidy/src/deps.rs b/src/tools/tidy/src/deps.rs index 4b521985ca1ad..733dc1e67fd30 100644 --- a/src/tools/tidy/src/deps.rs +++ b/src/tools/tidy/src/deps.rs @@ -132,6 +132,10 @@ const PERMITTED_DEPENDENCIES: &[&str] = &[ "parking_lot", "parking_lot_core", "pathdiff", + "phf", + "phf_codegen", + "phf_generator", + "phf_shared", "pkg-config", "polonius-engine", "ppv-lite86", @@ -163,6 +167,7 @@ const PERMITTED_DEPENDENCIES: &[&str] = &[ "serde_derive", "sha-1", "sha2", + "siphasher", "smallvec", "snap", "stable_deref_trait",