diff --git a/analyzeme/Cargo.toml b/analyzeme/Cargo.toml index 83072ae..dd7b0e4 100644 --- a/analyzeme/Cargo.toml +++ b/analyzeme/Cargo.toml @@ -6,6 +6,8 @@ edition = "2018" license = "MIT OR Apache-2.0" [dependencies] +byteorder = "1.2.7" +memchr = "2" measureme = { path = "../measureme" } rustc-hash = "1.0.1" serde = { version = "1.0", features = [ "derive" ] } diff --git a/analyzeme/src/lib.rs b/analyzeme/src/lib.rs index 3782ebf..f60b827 100644 --- a/analyzeme/src/lib.rs +++ b/analyzeme/src/lib.rs @@ -17,11 +17,13 @@ mod event; mod lightweight_event; mod profiling_data; mod stack_collapse; -mod timestamp; +mod stringtable; pub mod testing_common; +mod timestamp; pub use crate::event::Event; pub use crate::lightweight_event::LightweightEvent; pub use crate::profiling_data::{ProfilingData, ProfilingDataBuilder}; pub use crate::stack_collapse::collapse_stacks; +pub use crate::stringtable::{StringRef, StringTable}; pub use crate::timestamp::Timestamp; diff --git a/analyzeme/src/profiling_data.rs b/analyzeme/src/profiling_data.rs index 0bed950..9325fc1 100644 --- a/analyzeme/src/profiling_data.rs +++ b/analyzeme/src/profiling_data.rs @@ -1,12 +1,13 @@ use crate::event::Event; use crate::lightweight_event::LightweightEvent; +use crate::StringTable; use crate::timestamp::Timestamp; use measureme::file_header::{ read_file_header, write_file_header, CURRENT_FILE_FORMAT_VERSION, FILE_HEADER_SIZE, FILE_MAGIC_EVENT_STREAM, }; use measureme::ByteVecSink; -use measureme::{ProfilerFiles, RawEvent, SerializationSink, StringTable, StringTableBuilder}; +use measureme::{ProfilerFiles, RawEvent, SerializationSink, StringTableBuilder}; use serde::{Deserialize, Deserializer}; use std::error::Error; use std::fs; diff --git a/analyzeme/src/stringtable.rs b/analyzeme/src/stringtable.rs new file mode 100644 index 0000000..8dae55c --- /dev/null +++ b/analyzeme/src/stringtable.rs @@ -0,0 +1,306 @@ +//! See module-level documentation `measureme::stringtable`. 
+ +use byteorder::{BigEndian, ByteOrder, LittleEndian}; +use measureme::file_header::{ + read_file_header, strip_file_header, CURRENT_FILE_FORMAT_VERSION, FILE_MAGIC_STRINGTABLE_DATA, + FILE_MAGIC_STRINGTABLE_INDEX, +}; +use measureme::stringtable::{METADATA_STRING_ID, STRING_ID_MASK, TERMINATOR}; +use measureme::{Addr, StringId}; +use rustc_hash::FxHashMap; +use std::borrow::Cow; +use std::error::Error; +use memchr::memchr; + +// See module-level documentation for more information on the encoding. +const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000; +const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000; + +fn deserialize_index_entry(bytes: &[u8]) -> (StringId, Addr) { + ( + StringId::reserved(LittleEndian::read_u32(&bytes[0..4])), + Addr(LittleEndian::read_u32(&bytes[4..8])), + ) +} + +#[derive(Copy, Clone)] +pub struct StringRef<'st> { + id: StringId, + table: &'st StringTable, +} + +impl<'st> StringRef<'st> { + pub fn to_string(&self) -> Cow<'st, str> { + + // Try to avoid the allocation, which we can do if this is a + // [value, 0xFF] entry. + let addr = self.table.index[&self.id]; + let pos = addr.as_usize(); + let slice_to_search = &self.table.string_data[pos..]; + + // Find the first 0xFF byte which is either the sequence + // terminator or a byte in the middle of a string id. Use `memchr` which + // is super fast. + let terminator_pos = memchr(TERMINATOR, slice_to_search).unwrap(); + + // Decode the bytes until the terminator. If there is a string id in + // between somewhere this will fail, and we fall back to the allocating + // path. 
+ if let Ok(s) = std::str::from_utf8(&slice_to_search[..terminator_pos]) { + Cow::from(s) + } else { + let mut output = String::new(); + self.write_to_string(&mut output); + Cow::from(output) + } + } + + pub fn write_to_string(&self, output: &mut String) { + let addr = self.table.index[&self.id]; + let mut pos = addr.as_usize(); + + loop { + let byte = self.table.string_data[pos]; + + if byte == TERMINATOR { + return; + } else if (byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE { + // This is a string-id + let id = BigEndian::read_u32(&self.table.string_data[pos..pos + 4]); + + // Mask off the `0b10` prefix + let id = id & STRING_ID_MASK; + + let string_ref = StringRef { + id: StringId::reserved(id), + table: self.table, + }; + + string_ref.write_to_string(output); + + pos += 4; + } else { + while let Some((c, len)) = decode_utf8_char(&self.table.string_data[pos..]) { + output.push(c); + pos += len; + } + } + } + } +} + +// Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`. +// Returns the decoded `char` and its size in bytes if it succeeds. +// Returns `None` if `bytes` does not start with a valid UTF-8 codepoint. +// See https://en.wikipedia.org/wiki/UTF-8 for in-depth information on the +// encoding. 
+fn decode_utf8_char(bytes: &[u8]) -> Option<(char, usize)> { + use std::convert::TryFrom; + let first_byte = bytes[0] as u32; + let (codepoint, len) = if (first_byte & 0b1000_0000) == 0 { + // The highest bit is zero, so this is a single-byte char + (first_byte, 1) + } else if (first_byte & 0b1110_0000) == 0b1100_0000 { + // This is a two byte character + let bits0 = first_byte & 0b0001_1111; + let bits1 = (bytes[1] & 0b0011_1111) as u32; + + (bits0 << 6 | bits1, 2) + } else if (first_byte & 0b1111_0000) == 0b1110_0000 { + // This is a three byte character + let bits0 = first_byte & 0b0000_1111; + let bits1 = (bytes[1] & 0b0011_1111) as u32; + let bits2 = (bytes[2] & 0b0011_1111) as u32; + + ((bits0 << 12) | (bits1 << 6) | bits2, 3) + } else if (first_byte & 0b1111_1000) == 0b1111_0000 { + // This is a four byte character + let bits0 = first_byte & 0b0000_0111; + let bits1 = (bytes[1] & 0b0011_1111) as u32; + let bits2 = (bytes[2] & 0b0011_1111) as u32; + let bits3 = (bytes[3] & 0b0011_1111) as u32; + + ((bits0 << 18) | (bits1 << 12) | (bits2 << 6) | bits3, 4) + } else { + return None; + }; + + match char::try_from(codepoint) { + Ok(c) => { + debug_assert!({ + let test_bytes = &mut [0u8; 8]; + c.encode_utf8(test_bytes); + &test_bytes[..len] == &bytes[..len] + }); + + Some((c, len)) + } + Err(e) => { + panic!("StringTable: Encountered invalid UTF8 char: {:?}", e); + } + } +} + +/// Read-only version of the string table +#[derive(Debug)] +pub struct StringTable { + // TODO: Replace with something lazy + string_data: Vec<u8>, + index: FxHashMap<StringId, Addr>, +} + +impl StringTable { + pub fn new(string_data: Vec<u8>, index_data: Vec<u8>) -> Result<StringTable, Box<dyn Error>> { + let string_data_format = read_file_header(&string_data, FILE_MAGIC_STRINGTABLE_DATA)?; + let index_data_format = read_file_header(&index_data, FILE_MAGIC_STRINGTABLE_INDEX)?; + + if string_data_format != index_data_format { + Err("Mismatch between StringTable DATA and INDEX format version")?; + } + + if string_data_format != 
CURRENT_FILE_FORMAT_VERSION { + Err(format!( + "StringTable file format version '{}' is not supported + by this version of `measureme`.", + string_data_format + ))?; + } + + assert!(index_data.len() % 8 == 0); + let index: FxHashMap<_, _> = strip_file_header(&index_data) + .chunks(8) + .map(deserialize_index_entry) + .collect(); + + Ok(StringTable { string_data, index }) + } + + #[inline] + pub fn get<'a>(&'a self, id: StringId) -> StringRef<'a> { + StringRef { id, table: self } + } + + pub fn get_metadata<'a>(&'a self) -> StringRef<'a> { + let id = StringId::reserved(METADATA_STRING_ID); + self.get(id) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use measureme::{ByteVecSink, StringComponent, StringTableBuilder}; + use std::sync::Arc; + + #[test] + fn simple_strings() { + let data_sink = Arc::new(ByteVecSink::new()); + let index_sink = Arc::new(ByteVecSink::new()); + + let expected_strings = &[ + "abc", + "", + "xyz", + "g2h9284hgjv282y32983849&(*^&YIJ#R)(F83 f 23 2g4 35g5y", + "", + "", + "g2h9284hgjv282y32983849&35g5y", + ]; + + let mut string_ids = vec![]; + + { + let builder = StringTableBuilder::new(data_sink.clone(), index_sink.clone()); + + for &s in expected_strings { + string_ids.push(builder.alloc(s)); + } + } + + let data_bytes = Arc::try_unwrap(data_sink).unwrap().into_bytes(); + let index_bytes = Arc::try_unwrap(index_sink).unwrap().into_bytes(); + + let string_table = StringTable::new(data_bytes, index_bytes).unwrap(); + + for (&id, &expected_string) in string_ids.iter().zip(expected_strings.iter()) { + let str_ref = string_table.get(id); + + assert_eq!(str_ref.to_string(), expected_string); + + let mut write_to = String::new(); + str_ref.write_to_string(&mut write_to); + assert_eq!(str_ref.to_string(), write_to); + } + } + + #[test] + fn composite_string() { + let data_sink = Arc::new(ByteVecSink::new()); + let index_sink = Arc::new(ByteVecSink::new()); + + let expected_strings = &[ + "abc", // 0 + "abcabc", // 1 + "abcabcabc", // 2 + 
"abcabcabc", // 3 + "abcabcabc", // 4 + "abcabcabcabc", // 5 + "xxabcabcuuuabcabcqqq", // 6 + "xxxxxx", // 7 + ]; + + let mut string_ids = vec![]; + + { + let builder = StringTableBuilder::new(data_sink.clone(), index_sink.clone()); + + let r = |id| StringComponent::Ref(id); + let v = |s| StringComponent::Value(s); + + string_ids.push(builder.alloc("abc")); // 0 + string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[0])])); // 1 + string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[0]), r(string_ids[0])])); // 2 + string_ids.push(builder.alloc(&[r(string_ids[1]), r(string_ids[0])])); // 3 + string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[1])])); // 4 + string_ids.push(builder.alloc(&[r(string_ids[1]), r(string_ids[1])])); // 5 + string_ids.push(builder.alloc(&[ + v("xx"), + r(string_ids[1]), + v("uuu"), + r(string_ids[1]), + v("qqq"), + ])); // 6 + } + + let data_bytes = Arc::try_unwrap(data_sink).unwrap().into_bytes(); + let index_bytes = Arc::try_unwrap(index_sink).unwrap().into_bytes(); + + let string_table = StringTable::new(data_bytes, index_bytes).unwrap(); + + for (&id, &expected_string) in string_ids.iter().zip(expected_strings.iter()) { + let str_ref = string_table.get(id); + + assert_eq!(str_ref.to_string(), expected_string); + + let mut write_to = String::new(); + str_ref.write_to_string(&mut write_to); + assert_eq!(str_ref.to_string(), write_to); + } + } + + #[test] + fn utf8_char_decoding() { + use std::convert::TryFrom; + + // Let's just test all possible codepoints because there are not that + // many actually. 
+ for codepoint in 0..=0x10FFFFu32 { + if let Ok(expected_char) = char::try_from(codepoint) { + let buffer = &mut [0; 4]; + let expected_len = expected_char.encode_utf8(buffer).len(); + let expected = Some((expected_char, expected_len)); + assert_eq!(expected, decode_utf8_char(&buffer[..])); + } + } + } +} diff --git a/measureme/src/file_header.rs b/measureme/src/file_header.rs index f3656b7..e32ef59 100644 --- a/measureme/src/file_header.rs +++ b/measureme/src/file_header.rs @@ -6,7 +6,7 @@ use crate::serialization::SerializationSink; use byteorder::{ByteOrder, LittleEndian}; use std::error::Error; -pub const CURRENT_FILE_FORMAT_VERSION: u32 = 2; +pub const CURRENT_FILE_FORMAT_VERSION: u32 = 3; pub const FILE_MAGIC_EVENT_STREAM: &[u8; 4] = b"MMES"; pub const FILE_MAGIC_STRINGTABLE_DATA: &[u8; 4] = b"MMSD"; pub const FILE_MAGIC_STRINGTABLE_INDEX: &[u8; 4] = b"MMSI"; diff --git a/measureme/src/lib.rs b/measureme/src/lib.rs index 08312d8..9de7bde 100644 --- a/measureme/src/lib.rs +++ b/measureme/src/lib.rs @@ -46,7 +46,7 @@ mod mmap_serialization_sink; mod profiler; mod raw_event; mod serialization; -mod stringtable; +pub mod stringtable; pub mod rustc; @@ -57,6 +57,4 @@ pub use crate::mmap_serialization_sink::MmapSerializationSink; pub use crate::profiler::{Profiler, ProfilerFiles, TimingGuard}; pub use crate::raw_event::{RawEvent, MAX_INSTANT_TIMESTAMP, MAX_INTERVAL_TIMESTAMP}; pub use crate::serialization::{Addr, ByteVecSink, SerializationSink}; -pub use crate::stringtable::{ - SerializableString, StringId, StringRef, StringTable, StringTableBuilder, -}; +pub use crate::stringtable::{SerializableString, StringComponent, StringId, StringTableBuilder}; diff --git a/measureme/src/stringtable.rs b/measureme/src/stringtable.rs index a3fcc15..e4efab8 100644 --- a/measureme/src/stringtable.rs +++ b/measureme/src/stringtable.rs @@ -1,24 +1,55 @@ //! A string table implementation with a tree-like encoding. //! -//! 
Each entry in the table represents a string and encoded is a list of +//! Each entry in the table represents a string and is encoded as a list of //! components where each component can either be //! -//! 1. a TAG_STR_VAL that contains actual string content, -//! 2. a TAG_STR_REF that contains a reference to another entry, or -//! 3. a TAG_TERMINATOR which marks the end of a component list. +//! 1. a string _value_ that contains actual UTF-8 string content, +//! 2. a string _ID_ that contains a reference to another entry, or +//! 3. a terminator tag which marks the end of a component list. //! -//! The string content of an entry is defined as the concatenation of the -//! content of its components. The content of a `TAG_STR_VAL` is its actual -//! UTF-8 bytes. The content of a `TAG_STR_REF` is the contents of the entry +//! The string _content_ of an entry is defined as the concatenation of the +//! content of its components. The content of a string value is its actual +//! UTF-8 bytes. The content of a string ID is the contents of the entry //! it references. //! -//! Each string is referred to via a `StringId`. `StringId`s may be generated in two ways: -//! 1. Calling `StringTable::alloc()` which returns the `StringId` for the allocated string. -//! 2. Calling `StringTable::alloc_with_reserved_id()` and `StringId::reserved()`. +//! The byte-level encoding of component lists uses the structure of UTF-8 in +//! order to save space: //! -//! Reserved strings allow you to deduplicate strings by allocating a string once and then referring -//! to it by id over and over. This is a useful trick for strings which are recorded many times and -//! it can significantly reduce the size of profile trace files. +//! - A valid UTF-8 codepoint never starts with the bits `10` as this bit +//! prefix is reserved for bytes in the middle of a UTF-8 codepoint byte +//! sequence. We make use of this fact by letting all string ID components +//! start with this `10` prefix. 
Thus when we parse the contents of a value +//! we know to stop if the start byte of the next codepoint has this prefix. +//! +//! - A valid UTF-8 string cannot contain the `0xFF` byte and since string IDs +//! start with `10` as described above, they also cannot start with a `0xFF` +//! byte. Thus we can safely use `0xFF` as our component list terminator. +//! +//! The sample composite string ["abc", ID(42), "def", TERMINATOR] would thus be +//! encoded as: +//! +//! ```ignore +//! ['a', 'b' , 'c', 128, 0, 0, 42, 'd', 'e', 'f', 255] +//! ^^^^^^^^^^^^^ ^^^ +//! string ID 42 with 0b10 prefix terminator (0xFF) +//! ``` +//! +//! As you can see string IDs are encoded in big endian format so that highest +//! order bits show up in the first byte we encounter. +//! +//! ---------------------------------------------------------------------------- +//! +//! Each string in the table is referred to via a `StringId`. `StringId`s may +//! be generated in two ways: +//! +//! 1. Calling `StringTable::alloc()` which returns the `StringId` for the +//! allocated string. +//! 2. Calling `StringTable::alloc_with_reserved_id()` and `StringId::reserved()`. +//! +//! String IDs allow you to deduplicate strings by allocating a string +//! once and then referring to it by id over and over. This is a useful trick +//! for strings which are recorded many times and it can significantly reduce +//! the size of profile trace files. //! //! `StringId`s are partitioned according to type: //! @@ -28,16 +59,13 @@ //! After `MAX_PRE_RESERVED_STRING_ID`, there is one string id (`METADATA_STRING_ID`) which is used //! internally by `measureme` to record additional metadata about the profiling session. //! After `METADATA_STRING_ID` are all other `StringId` values. +//! 
use crate::file_header::{ - read_file_header, strip_file_header, write_file_header, CURRENT_FILE_FORMAT_VERSION, - FILE_MAGIC_STRINGTABLE_DATA, FILE_MAGIC_STRINGTABLE_INDEX, + write_file_header, FILE_MAGIC_STRINGTABLE_DATA, FILE_MAGIC_STRINGTABLE_INDEX, }; use crate::serialization::{Addr, SerializationSink}; -use byteorder::{ByteOrder, LittleEndian}; -use rustc_hash::FxHashMap; -use std::borrow::Cow; -use std::error::Error; +use byteorder::{BigEndian, ByteOrder, LittleEndian}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::Arc; @@ -49,6 +77,7 @@ pub struct StringId(u32); impl StringId { #[inline] pub fn reserved(id: u32) -> StringId { + assert!(id == id & STRING_ID_MASK); StringId(id) } @@ -58,22 +87,18 @@ } } -// Tags for the binary encoding of strings +// See module-level documentation for more information on the encoding. +pub const TERMINATOR: u8 = 0xFF; -/// Marks the end of a string component list. -const TAG_TERMINATOR: u8 = 0; - -/// Marks a component that contains actual string data. -const TAG_STR_VAL: u8 = 1; - -/// Marks a component that contains the ID of another string. -const TAG_STR_REF: u8 = 2; + +// All 1s except for the two highest bits. +pub const MAX_STRING_ID: u32 = 0x3FFF_FFFF; +pub const STRING_ID_MASK: u32 = 0x3FFF_FFFF; /// The maximum id value a prereserved string may be. -const MAX_PRE_RESERVED_STRING_ID: u32 = std::u32::MAX / 2; +const MAX_PRE_RESERVED_STRING_ID: u32 = MAX_STRING_ID / 2; /// The id of the profile metadata string entry. -pub(crate) const METADATA_STRING_ID: u32 = MAX_PRE_RESERVED_STRING_ID + 1; +pub const METADATA_STRING_ID: u32 = MAX_PRE_RESERVED_STRING_ID + 1; /// Write-only version of the string table pub struct StringTableBuilder<S: SerializationSink> { @@ -89,28 +114,19 @@ pub trait SerializableString { fn serialize(&self, bytes: &mut [u8]); } -// A simple string is encoded as -// -// [TAG_STR_VAL, len: u16, utf8_bytes, TAG_TERMINATOR] -// -// in the string table. 
+// A single string is encoded as `[UTF-8 bytes][TERMINATOR]` impl SerializableString for str { #[inline] fn serialized_size(&self) -> usize { - 1 + // tag - 2 + // len self.len() + // actual bytes 1 // terminator } #[inline] fn serialize(&self, bytes: &mut [u8]) { - assert!(self.len() <= std::u16::MAX as usize); let last_byte_index = bytes.len() - 1; - bytes[0] = TAG_STR_VAL; - LittleEndian::write_u16(&mut bytes[1..3], self.len() as u16); - bytes[3..last_byte_index].copy_from_slice(self.as_bytes()); - bytes[last_byte_index] = TAG_TERMINATOR; + bytes[0..last_byte_index].copy_from_slice(self.as_bytes()); + bytes[last_byte_index] = TERMINATOR; } } @@ -120,18 +136,87 @@ pub enum StringComponent<'s> { Ref(StringId), } +impl<'s> StringComponent<'s> { + #[inline] + fn serialized_size(&self) -> usize { + match *self { + StringComponent::Value(s) => s.len(), + StringComponent::Ref(_) => 4, + } + } + + #[inline] + fn serialize<'b>(&self, bytes: &'b mut [u8]) -> &'b mut [u8] { + match *self { + StringComponent::Value(s) => { + bytes[..s.len()].copy_from_slice(s.as_bytes()); + &mut bytes[s.len()..] + } + StringComponent::Ref(string_id) => { + assert!(string_id.0 == string_id.0 & STRING_ID_MASK); + let tagged = string_id.0 | (1u32 << 31); + + BigEndian::write_u32(&mut bytes[0..4], tagged); + &mut bytes[4..] + } + } + } +} + impl<'a> SerializableString for [StringComponent<'a>] { #[inline] fn serialized_size(&self) -> usize { - unimplemented!() + self.iter().map(|c| c.serialized_size()).sum::<usize>() + // size of components + 1 // terminator } #[inline] - fn serialize(&self, _bytes: &mut [u8]) { - unimplemented!() + fn serialize(&self, mut bytes: &mut [u8]) { + assert!(bytes.len() == self.serialized_size()); + for component in self.iter() { + bytes = component.serialize(bytes); + } + + // Assert that we used the exact number of bytes we anticipated. + assert!(bytes.len() == 1); + bytes[0] = TERMINATOR; } } +macro_rules! 
impl_serializable_string_for_fixed_size { + ($n:expr) => { + impl<'a> SerializableString for [StringComponent<'a>; $n] { + #[inline(always)] + fn serialized_size(&self) -> usize { + (&self[..]).serialized_size() + } + + #[inline(always)] + fn serialize(&self, bytes: &mut [u8]) { + (&self[..]).serialize(bytes); + } + } + }; +} + +impl_serializable_string_for_fixed_size!(0); +impl_serializable_string_for_fixed_size!(1); +impl_serializable_string_for_fixed_size!(2); +impl_serializable_string_for_fixed_size!(3); +impl_serializable_string_for_fixed_size!(4); +impl_serializable_string_for_fixed_size!(5); +impl_serializable_string_for_fixed_size!(6); +impl_serializable_string_for_fixed_size!(7); +impl_serializable_string_for_fixed_size!(8); +impl_serializable_string_for_fixed_size!(9); +impl_serializable_string_for_fixed_size!(10); +impl_serializable_string_for_fixed_size!(11); +impl_serializable_string_for_fixed_size!(12); +impl_serializable_string_for_fixed_size!(13); +impl_serializable_string_for_fixed_size!(14); +impl_serializable_string_for_fixed_size!(15); +impl_serializable_string_for_fixed_size!(16); + fn serialize_index_entry<S: SerializationSink>(sink: &S, id: StringId, addr: Addr) { sink.write_atomic(8, |bytes| { LittleEndian::write_u32(&mut bytes[0..4], id.0); @@ -139,13 +224,6 @@ }); } -fn deserialize_index_entry(bytes: &[u8]) -> (StringId, Addr) { - ( - StringId(LittleEndian::read_u32(&bytes[0..4])), - Addr(LittleEndian::read_u32(&bytes[4..8])), - ) -} - impl<S: SerializationSink> StringTableBuilder<S> { pub fn new(data_sink: Arc<S>, index_sink: Arc<S>) -> StringTableBuilder<S> { // The first thing in every file we generate must be the file header. 
@@ -159,7 +237,6 @@ impl<S: SerializationSink> StringTableBuilder<S> { } } - #[inline] pub fn alloc_with_reserved_id<STR: SerializableString + ?Sized>( &self, id: StringId, @@ -176,10 +253,10 @@ id } - #[inline] pub fn alloc<STR: SerializableString + ?Sized>(&self, s: &STR) -> StringId { let id = StringId(self.id_counter.fetch_add(1, Ordering::SeqCst)); - debug_assert!(id.0 > METADATA_STRING_ID); + assert!(id.0 > METADATA_STRING_ID); + assert!(id.0 <= MAX_STRING_ID); self.alloc_unchecked(id, s); id } @@ -194,163 +271,3 @@ serialize_index_entry(&*self.index_sink, id, addr); } } - -#[derive(Copy, Clone)] -pub struct StringRef<'st> { - id: StringId, - table: &'st StringTable, -} - -impl<'st> StringRef<'st> { - pub fn to_string(&self) -> Cow<'st, str> { - let addr = self.table.index[&self.id].as_usize(); - let tag = self.table.string_data[addr]; - - match tag { - TAG_STR_VAL => { - let len = - LittleEndian::read_u16(&self.table.string_data[addr + 1..addr + 3]) as usize; - let next_component_addr = addr + 3 + len; - let next_tag = self.table.string_data[next_component_addr]; - - if next_tag == TAG_TERMINATOR { - let bytes = &self.table.string_data[addr + 3..addr + 3 + len]; - return Cow::from(std::str::from_utf8(bytes).unwrap()); - } - } - TAG_TERMINATOR => { - return Cow::from(""); - } - _ => { - // we have to take the allocating path - } - } - - let mut output = String::new(); - self.write_to_string(&mut output); - Cow::from(output) - } - - pub fn write_to_string(&self, output: &mut String) { - let addr = self.table.index[&self.id]; - - let mut pos = addr.as_usize(); - - loop { - let tag = self.table.string_data[pos]; - - match tag { - TAG_STR_VAL => { - pos += 1; - let len = - LittleEndian::read_u16(&self.table.string_data[pos..pos + 2]) as usize; - pos += 2; - let bytes = &self.table.string_data[pos..pos + len]; - let s = std::str::from_utf8(bytes).unwrap(); - output.push_str(s); - pos += len; - } - - TAG_STR_REF => { - unimplemented!(); - } - - TAG_TERMINATOR => return, - - _ => unreachable!(), - } - } - 
} -} - -/// Read-only version of the string table -#[derive(Debug)] -pub struct StringTable { - // TODO: Replace with something lazy - string_data: Vec<u8>, - index: FxHashMap<StringId, Addr>, -} - -impl StringTable { - pub fn new(string_data: Vec<u8>, index_data: Vec<u8>) -> Result<StringTable, Box<dyn Error>> { - let string_data_format = read_file_header(&string_data, FILE_MAGIC_STRINGTABLE_DATA)?; - let index_data_format = read_file_header(&index_data, FILE_MAGIC_STRINGTABLE_INDEX)?; - - if string_data_format != index_data_format { - Err("Mismatch between StringTable DATA and INDEX format version")?; - } - - if string_data_format != CURRENT_FILE_FORMAT_VERSION { - Err(format!( - "StringTable file format version '{}' is not supported - by this version of `measureme`.", - string_data_format - ))?; - } - - assert!(index_data.len() % 8 == 0); - let index: FxHashMap<_, _> = strip_file_header(&index_data) - .chunks(8) - .map(deserialize_index_entry) - .collect(); - - Ok(StringTable { string_data, index }) - } - - #[inline] - pub fn get<'a>(&'a self, id: StringId) -> StringRef<'a> { - StringRef { id, table: self } - } - pub fn get_metadata<'a>(&'a self) -> StringRef<'a> { - let id = StringId(METADATA_STRING_ID); - self.get(id) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn simple_strings() { - use crate::serialization::ByteVecSink; - - let data_sink = Arc::new(ByteVecSink::new()); - let index_sink = Arc::new(ByteVecSink::new()); - - let expected_strings = &[ - "abc", - "", - "xyz", - "g2h9284hgjv282y32983849&(*^&YIJ#R)(F83 f 23 2g4 35g5y", - "", - "", - "g2h9284hgjv282y32983849&35g5y", - ]; - - let mut string_ids = vec![]; - - { - let builder = StringTableBuilder::new(data_sink.clone(), index_sink.clone()); - - for &s in expected_strings { - string_ids.push(builder.alloc(s)); - } - } - - let data_bytes = Arc::try_unwrap(data_sink).unwrap().into_bytes(); - let index_bytes = Arc::try_unwrap(index_sink).unwrap().into_bytes(); - - let string_table = StringTable::new(data_bytes, index_bytes).unwrap(); - 
for (&id, &expected_string) in string_ids.iter().zip(expected_strings.iter()) { - let str_ref = string_table.get(id); - - assert_eq!(str_ref.to_string(), expected_string); - - let mut write_to = String::new(); - str_ref.write_to_string(&mut write_to); - assert_eq!(str_ref.to_string(), write_to); - } - } -}