Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions analyzeme/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ edition = "2018"
license = "MIT OR Apache-2.0"

[dependencies]
byteorder = "1.2.7"
memchr = "2"
measureme = { path = "../measureme" }
rustc-hash = "1.0.1"
serde = { version = "1.0", features = [ "derive" ] }
Expand Down
4 changes: 3 additions & 1 deletion analyzeme/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,13 @@ mod event;
mod lightweight_event;
mod profiling_data;
mod stack_collapse;
mod timestamp;
mod stringtable;
pub mod testing_common;
mod timestamp;

pub use crate::event::Event;
pub use crate::lightweight_event::LightweightEvent;
pub use crate::profiling_data::{ProfilingData, ProfilingDataBuilder};
pub use crate::stack_collapse::collapse_stacks;
pub use crate::stringtable::{StringRef, StringTable};
pub use crate::timestamp::Timestamp;
3 changes: 2 additions & 1 deletion analyzeme/src/profiling_data.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
use crate::event::Event;
use crate::lightweight_event::LightweightEvent;
use crate::StringTable;
use crate::timestamp::Timestamp;
use measureme::file_header::{
read_file_header, write_file_header, CURRENT_FILE_FORMAT_VERSION, FILE_HEADER_SIZE,
FILE_MAGIC_EVENT_STREAM,
};
use measureme::ByteVecSink;
use measureme::{ProfilerFiles, RawEvent, SerializationSink, StringTable, StringTableBuilder};
use measureme::{ProfilerFiles, RawEvent, SerializationSink, StringTableBuilder};
use serde::{Deserialize, Deserializer};
use std::error::Error;
use std::fs;
Expand Down
306 changes: 306 additions & 0 deletions analyzeme/src/stringtable.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,306 @@
//! See module-level documentation `measureme::stringtable`.

use byteorder::{BigEndian, ByteOrder, LittleEndian};
use measureme::file_header::{
read_file_header, strip_file_header, CURRENT_FILE_FORMAT_VERSION, FILE_MAGIC_STRINGTABLE_DATA,
FILE_MAGIC_STRINGTABLE_INDEX,
};
use measureme::stringtable::{METADATA_STRING_ID, STRING_ID_MASK, TERMINATOR};
use measureme::{Addr, StringId};
use rustc_hash::FxHashMap;
use std::borrow::Cow;
use std::error::Error;
use memchr::memchr;

// See module-level documentation for more information on the encoding.
// Selects the two most significant bits of a byte.
const UTF8_CONTINUATION_MASK: u8 = 0b1100_0000;
// Value of those two bits (`10`) for a UTF-8 continuation byte. No UTF-8
// character starts with such a byte, so `StringRef::write_to_string` treats a
// byte with this prefix as the first byte of an embedded string id.
const UTF8_CONTINUATION_BYTE: u8 = 0b1000_0000;

// Decodes one index entry: a little-endian `u32` string id in bytes `0..4`
// followed by a little-endian `u32` address in bytes `4..8`.
// Panics if `bytes` holds fewer than eight bytes.
fn deserialize_index_entry(bytes: &[u8]) -> (StringId, Addr) {
    let (id_bytes, addr_bytes) = (&bytes[0..4], &bytes[4..8]);

    let id = StringId::reserved(LittleEndian::read_u32(id_bytes));
    let addr = Addr(LittleEndian::read_u32(addr_bytes));

    (id, addr)
}

/// A cheap, copyable handle to one string stored in a [`StringTable`].
///
/// It pairs the id of the string with a borrow of the table that is used to
/// resolve it.
#[derive(Clone, Copy)]
pub struct StringRef<'st> {
    id: StringId,
    table: &'st StringTable,
}

impl<'st> StringRef<'st> {
    /// Returns the string this reference points to.
    ///
    /// The result borrows directly from the table's string data when the
    /// entry is a single plain value followed by the terminator. It is only
    /// allocated when the entry embeds references to other strings.
    ///
    /// # Panics
    ///
    /// Panics if the id has no entry in the table's index, if no terminator
    /// follows the entry's address, or if [`StringRef::write_to_string`]
    /// panics on the allocating path.
    pub fn to_string(&self) -> Cow<'st, str> {
        // Try to avoid the allocation, which we can do if this is a
        // [value, 0xFF] entry.
        let addr = self.table.index[&self.id];
        let pos = addr.as_usize();
        let slice_to_search = &self.table.string_data[pos..];

        // Find the first 0xFF byte which is either the sequence terminator
        // or a byte in the middle of a string id. Use `memchr` which is
        // super fast.
        let terminator_pos = memchr(TERMINATOR, slice_to_search)
            .expect("StringTable: string data contains no terminator after the entry");

        // Decode the bytes until the terminator. If there is a string id in
        // between somewhere this will fail, and we fall back to the
        // allocating path.
        match std::str::from_utf8(&slice_to_search[..terminator_pos]) {
            Ok(s) => Cow::from(s),
            Err(_) => {
                let mut output = String::new();
                self.write_to_string(&mut output);
                Cow::from(output)
            }
        }
    }

    /// Appends the string this reference points to onto `output`.
    ///
    /// The entry is walked byte by byte: plain UTF-8 text is decoded and
    /// pushed, embedded string ids are resolved recursively through the same
    /// table, and the terminator ends the walk.
    ///
    /// # Panics
    ///
    /// Panics if the id (or an embedded id) has no entry in the table's
    /// index, if the walk runs past the end of the string data, or if a byte
    /// is encountered that is neither a terminator, a string id nor the start
    /// of a UTF-8 character.
    pub fn write_to_string(&self, output: &mut String) {
        let addr = self.table.index[&self.id];
        let mut pos = addr.as_usize();

        loop {
            let byte = self.table.string_data[pos];

            if byte == TERMINATOR {
                return;
            } else if (byte & UTF8_CONTINUATION_MASK) == UTF8_CONTINUATION_BYTE {
                // This is a string-id
                let id = BigEndian::read_u32(&self.table.string_data[pos..pos + 4]);

                // Mask off the `0b10` prefix
                let id = id & STRING_ID_MASK;

                let string_ref = StringRef {
                    id: StringId::reserved(id),
                    table: self.table,
                };

                string_ref.write_to_string(output);

                pos += 4;
            } else {
                let run_start = pos;

                while let Some((c, len)) = decode_utf8_char(&self.table.string_data[pos..]) {
                    output.push(c);
                    pos += len;
                }

                // If not a single character could be decoded, `byte` is not
                // something any branch of this loop can consume. Going around
                // again would inspect the very same byte forever, so treat it
                // as corrupt data instead of hanging.
                if pos == run_start {
                    panic!(
                        "StringTable: Encountered invalid byte {:#04x} at offset {}",
                        byte, pos
                    );
                }
            }
        }
    }
}

// Tries to decode a UTF-8 codepoint starting at the beginning of `bytes`.
// Returns the decoded `char` and its size in bytes if it succeeds.
// Returns `None` if `bytes` is empty or its first byte cannot start a UTF-8
// codepoint (a continuation byte, or a byte of the form `0b1111_1xxx` such as
// the 0xFF terminator).
// Panics if the first byte announces more bytes than `bytes` contains, or if
// the decoded value is not a valid `char`. Both indicate corrupt string data.
// Continuation bytes are only validated in debug builds.
// See https://en.wikipedia.org/wiki/UTF-8 for in-depth information on the
// encoding.
fn decode_utf8_char(bytes: &[u8]) -> Option<(char, usize)> {
    use std::convert::TryFrom;

    // An empty slice does not start with a codepoint.
    let first_byte = u32::from(*bytes.first()?);

    // Payload (the low six bits) of the continuation byte at `index`.
    // A missing byte is reported explicitly rather than through an opaque
    // index-out-of-bounds panic.
    let continuation = |index: usize| -> u32 {
        match bytes.get(index) {
            Some(&byte) => u32::from(byte & 0b0011_1111),
            None => panic!(
                "StringTable: Encountered truncated UTF8 char: {:?}",
                bytes
            ),
        }
    };

    let (codepoint, len) = if (first_byte & 0b1000_0000) == 0 {
        // The highest bit is zero, so this is a single-byte char
        (first_byte, 1)
    } else if (first_byte & 0b1110_0000) == 0b1100_0000 {
        // This is a two byte character
        let bits0 = first_byte & 0b0001_1111;
        let bits1 = continuation(1);

        ((bits0 << 6) | bits1, 2)
    } else if (first_byte & 0b1111_0000) == 0b1110_0000 {
        // This is a three byte character
        let bits0 = first_byte & 0b0000_1111;
        let bits1 = continuation(1);
        let bits2 = continuation(2);

        ((bits0 << 12) | (bits1 << 6) | bits2, 3)
    } else if (first_byte & 0b1111_1000) == 0b1111_0000 {
        // This is a four byte character
        let bits0 = first_byte & 0b0000_0111;
        let bits1 = continuation(1);
        let bits2 = continuation(2);
        let bits3 = continuation(3);

        ((bits0 << 18) | (bits1 << 12) | (bits2 << 6) | bits3, 4)
    } else {
        return None;
    };

    match char::try_from(codepoint) {
        Ok(c) => {
            // Re-encoding must reproduce the input exactly; this catches
            // malformed continuation bytes and overlong encodings.
            debug_assert!({
                let mut test_bytes = [0u8; 4];
                c.encode_utf8(&mut test_bytes);
                test_bytes[..len] == bytes[..len]
            });

            Some((c, len))
        }
        Err(e) => {
            panic!("StringTable: Encountered invalid UTF8 char: {:?}", e);
        }
    }
}

/// Read-only view of a serialized string table.
///
/// Holds the raw string data together with an index mapping each string id
/// to the address of its entry within that data.
#[derive(Debug)]
pub struct StringTable {
    // TODO: Replace with something lazy
    string_data: Vec<u8>,
    index: FxHashMap<StringId, Addr>,
}

impl StringTable {
    /// Builds a string table from the raw contents of the string data file
    /// and the index file.
    ///
    /// # Errors
    ///
    /// Returns an error if either file header cannot be read, if the two
    /// files disagree on the format version, if that version is not
    /// `CURRENT_FILE_FORMAT_VERSION`, or if the index data following the
    /// header is not a whole number of 8-byte entries.
    pub fn new(string_data: Vec<u8>, index_data: Vec<u8>) -> Result<StringTable, Box<dyn Error>> {
        // Size in bytes of one serialized (string id, address) index entry.
        const INDEX_ENTRY_SIZE: usize = 8;

        let string_data_format = read_file_header(&string_data, FILE_MAGIC_STRINGTABLE_DATA)?;
        let index_data_format = read_file_header(&index_data, FILE_MAGIC_STRINGTABLE_INDEX)?;

        if string_data_format != index_data_format {
            return Err("Mismatch between StringTable DATA and INDEX format version".into());
        }

        if string_data_format != CURRENT_FILE_FORMAT_VERSION {
            return Err(format!(
                "StringTable file format version '{}' is not supported \
                 by this version of `measureme`.",
                string_data_format
            )
            .into());
        }

        // A trailing partial entry means the index file is corrupt. Report
        // it through the `Result` like every other malformed-input case
        // instead of panicking.
        let index_entries = strip_file_header(&index_data);
        if index_entries.len() % INDEX_ENTRY_SIZE != 0 {
            return Err(format!(
                "StringTable INDEX data has length {} which is not a multiple of the \
                 entry size {}",
                index_entries.len(),
                INDEX_ENTRY_SIZE
            )
            .into());
        }

        let index: FxHashMap<_, _> = index_entries
            .chunks_exact(INDEX_ENTRY_SIZE)
            .map(deserialize_index_entry)
            .collect();

        Ok(StringTable { string_data, index })
    }

    /// Returns a reference to the string with the given id.
    ///
    /// The id is not looked up here; an id without an index entry only
    /// causes a panic once the returned reference is resolved.
    #[inline]
    pub fn get(&self, id: StringId) -> StringRef<'_> {
        StringRef { id, table: self }
    }

    /// Returns a reference to the string stored under the reserved
    /// `METADATA_STRING_ID`.
    pub fn get_metadata(&self) -> StringRef<'_> {
        let id = StringId::reserved(METADATA_STRING_ID);
        self.get(id)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use measureme::{ByteVecSink, StringComponent, StringTableBuilder};
    use std::sync::Arc;

    /// Checks that every id resolves to its expected string through both
    /// `to_string` and `write_to_string`.
    ///
    /// The lengths are compared first because `zip` would otherwise silently
    /// skip any expectation that has no matching id.
    fn assert_table_contents(table: &StringTable, ids: &[StringId], expected: &[&str]) {
        assert_eq!(ids.len(), expected.len());

        for (&id, &expected_string) in ids.iter().zip(expected.iter()) {
            let str_ref = table.get(id);

            assert_eq!(str_ref.to_string(), expected_string);

            let mut write_to = String::new();
            str_ref.write_to_string(&mut write_to);
            assert_eq!(str_ref.to_string(), write_to);
        }
    }

    #[test]
    fn simple_strings() {
        let data_sink = Arc::new(ByteVecSink::new());
        let index_sink = Arc::new(ByteVecSink::new());

        let expected_strings = &[
            "abc",
            "",
            "xyz",
            "g2h9284hgjv282y32983849&(*^&YIJ#R)(F83 f 23 2g4 35g5y",
            "",
            "",
            "g2h9284hgjv282y32983849&35g5y",
        ];

        let mut string_ids = vec![];

        {
            let builder = StringTableBuilder::new(data_sink.clone(), index_sink.clone());

            for &s in expected_strings {
                string_ids.push(builder.alloc(s));
            }
        }

        let data_bytes = Arc::try_unwrap(data_sink).unwrap().into_bytes();
        let index_bytes = Arc::try_unwrap(index_sink).unwrap().into_bytes();

        let string_table = StringTable::new(data_bytes, index_bytes).unwrap();

        assert_table_contents(&string_table, &string_ids, expected_strings);
    }

    #[test]
    fn composite_string() {
        let data_sink = Arc::new(ByteVecSink::new());
        let index_sink = Arc::new(ByteVecSink::new());

        // One expectation per string allocated below, in allocation order.
        let expected_strings = &[
            "abc",                  // 0
            "abcabc",               // 1
            "abcabcabc",            // 2
            "abcabcabc",            // 3
            "abcabcabc",            // 4
            "abcabcabcabc",         // 5
            "xxabcabcuuuabcabcqqq", // 6
        ];

        let mut string_ids = vec![];

        {
            let builder = StringTableBuilder::new(data_sink.clone(), index_sink.clone());

            let r = |id| StringComponent::Ref(id);
            let v = |s| StringComponent::Value(s);

            string_ids.push(builder.alloc("abc")); // 0
            string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[0])])); // 1
            string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[0]), r(string_ids[0])])); // 2
            string_ids.push(builder.alloc(&[r(string_ids[1]), r(string_ids[0])])); // 3
            string_ids.push(builder.alloc(&[r(string_ids[0]), r(string_ids[1])])); // 4
            string_ids.push(builder.alloc(&[r(string_ids[1]), r(string_ids[1])])); // 5
            string_ids.push(builder.alloc(&[
                v("xx"),
                r(string_ids[1]),
                v("uuu"),
                r(string_ids[1]),
                v("qqq"),
            ])); // 6
        }

        let data_bytes = Arc::try_unwrap(data_sink).unwrap().into_bytes();
        let index_bytes = Arc::try_unwrap(index_sink).unwrap().into_bytes();

        let string_table = StringTable::new(data_bytes, index_bytes).unwrap();

        assert_table_contents(&string_table, &string_ids, expected_strings);
    }

    #[test]
    fn utf8_char_decoding() {
        use std::convert::TryFrom;

        // Let's just test all possible codepoints because there are not that
        // many actually.
        for codepoint in 0..=0x10FFFFu32 {
            if let Ok(expected_char) = char::try_from(codepoint) {
                let buffer = &mut [0; 4];
                let expected_len = expected_char.encode_utf8(buffer).len();
                let expected = Some((expected_char, expected_len));
                assert_eq!(expected, decode_utf8_char(&buffer[..]));
            }
        }
    }
}
2 changes: 1 addition & 1 deletion measureme/src/file_header.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::serialization::SerializationSink;
use byteorder::{ByteOrder, LittleEndian};
use std::error::Error;

pub const CURRENT_FILE_FORMAT_VERSION: u32 = 2;
pub const CURRENT_FILE_FORMAT_VERSION: u32 = 3;
pub const FILE_MAGIC_EVENT_STREAM: &[u8; 4] = b"MMES";
pub const FILE_MAGIC_STRINGTABLE_DATA: &[u8; 4] = b"MMSD";
pub const FILE_MAGIC_STRINGTABLE_INDEX: &[u8; 4] = b"MMSI";
Expand Down
6 changes: 2 additions & 4 deletions measureme/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ mod mmap_serialization_sink;
mod profiler;
mod raw_event;
mod serialization;
mod stringtable;
pub mod stringtable;

pub mod rustc;

Expand All @@ -57,6 +57,4 @@ pub use crate::mmap_serialization_sink::MmapSerializationSink;
pub use crate::profiler::{Profiler, ProfilerFiles, TimingGuard};
pub use crate::raw_event::{RawEvent, MAX_INSTANT_TIMESTAMP, MAX_INTERVAL_TIMESTAMP};
pub use crate::serialization::{Addr, ByteVecSink, SerializationSink};
pub use crate::stringtable::{
SerializableString, StringId, StringRef, StringTable, StringTableBuilder,
};
pub use crate::stringtable::{SerializableString, StringComponent, StringId, StringTableBuilder};
Loading