From d9091df6cfc51db80fbec49ce7b00fb5e57e6a73 Mon Sep 17 00:00:00 2001 From: John Kugelman Date: Sat, 9 Oct 2021 15:11:20 -0400 Subject: [PATCH] Add Read::is_append_only method to optimize read_to_string I've added a new method called `is_append_only` to the `Read` trait. It returns true if `read_to_end` and `read_to_string` can be trusted not to read or overwrite any existing data in their input buffer. These readers use it to signal that they have optimized `read_to_end` methods: * `[u8]` extends the input buffer to the full slice size all at once instead of doubling its size repeatedly until it's big enough. * `File` pre-allocates enough space for the full file size, allowing for fewer `read` syscalls and reducing buffer reallocations. These readers take advantage of it: * If `BufReader` wraps a `[u8]` or `File` its `read_to_string` will call their optimized `read_to_end`s. Overall, this fixes a slow path where `BufReader::read_to_string` would be forced to read into a side buffer when passed a non-empty input buffer. --- library/std/src/fs.rs | 14 ++--- library/std/src/io/buffered/bufreader.rs | 36 +---------- library/std/src/io/impls.rs | 14 +++++ library/std/src/io/mod.rs | 80 +++++++++++++++++------- 4 files changed, 78 insertions(+), 66 deletions(-) diff --git a/library/std/src/fs.rs b/library/std/src/fs.rs index 726c855c4fd36..6cdb729f5e87e 100644 --- a/library/std/src/fs.rs +++ b/library/std/src/fs.rs @@ -640,10 +640,9 @@ impl Read for File { io::default_read_to_end(self, buf) } - // Reserves space in the buffer based on the file size when available. - fn read_to_string(&mut self, buf: &mut String) -> io::Result { - buf.reserve(buffer_capacity_required(self)); - io::default_read_to_string(self, buf) + unsafe fn is_append_only(&self) -> bool { + // `io::default_read_to_end` is append-only. + true } } #[stable(feature = "rust1", since = "1.0.0")] @@ -698,10 +697,9 @@ impl Read for &File { io::default_read_to_end(self, buf) } - // Reserves space in the buffer based on the file size when available. - fn read_to_string(&mut self, buf: &mut String) -> io::Result { - buf.reserve(buffer_capacity_required(self)); - io::default_read_to_string(self, buf) + unsafe fn is_append_only(&self) -> bool { + // `io::default_read_to_end` is append-only. + true } } #[stable(feature = "rust1", since = "1.0.0")] diff --git a/library/std/src/io/buffered/bufreader.rs b/library/std/src/io/buffered/bufreader.rs index 243207a606544..c4368cc212f41 100644 --- a/library/std/src/io/buffered/bufreader.rs +++ b/library/std/src/io/buffered/bufreader.rs @@ -318,40 +318,8 @@ impl Read for BufReader { Ok(nread + self.inner.read_to_end(buf)?) } - // The inner reader might have an optimized `read_to_end`. Drain our buffer and then - // delegate to the inner implementation. - fn read_to_string(&mut self, buf: &mut String) -> io::Result { - // In the general `else` case below we must read bytes into a side buffer, check - // that they are valid UTF-8, and then append them to `buf`. This requires a - // potentially large memcpy. - // - // If `buf` is empty--the most common case--we can leverage `append_to_string` - // to read directly into `buf`'s internal byte buffer, saving an allocation and - // a memcpy. - if buf.is_empty() { - // `append_to_string`'s safety relies on the buffer only being appended to since - // it only checks the UTF-8 validity of new data. If there were existing content in - // `buf` then an untrustworthy reader (i.e. `self.inner`) could not only append - // bytes but also modify existing bytes and render them invalid. On the other hand, - // if `buf` is empty then by definition any writes must be appends and - // `append_to_string` will validate all of the new bytes. - unsafe { crate::io::append_to_string(buf, |b| self.read_to_end(b)) } - } else { - // We cannot append our byte buffer directly onto the `buf` String as there could - // be an incomplete UTF-8 sequence that has only been partially read. We must read - // everything into a side buffer first and then call `from_utf8` on the complete - // buffer. - let mut bytes = Vec::new(); - self.read_to_end(&mut bytes)?; - let string = crate::str::from_utf8(&bytes).map_err(|_| { - io::Error::new_const( - io::ErrorKind::InvalidData, - &"stream did not contain valid UTF-8", - ) - })?; - *buf += string; - Ok(string.len()) - } + unsafe fn is_append_only(&self) -> bool { + self.inner.is_append_only() } } diff --git a/library/std/src/io/impls.rs b/library/std/src/io/impls.rs index 7a2a49ba7d707..2363f1a1779b9 100644 --- a/library/std/src/io/impls.rs +++ b/library/std/src/io/impls.rs @@ -44,6 +44,11 @@ impl Read for &mut R { (**self).read_to_string(buf) } + #[inline] + unsafe fn is_append_only(&self) -> bool { + (**self).is_append_only() + } + #[inline] fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> { (**self).read_exact(buf) @@ -148,6 +153,11 @@ impl Read for Box { (**self).read_to_string(buf) } + #[inline] + unsafe fn is_append_only(&self) -> bool { + (**self).is_append_only() + } + #[inline] fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> { (**self).read_exact(buf) @@ -297,6 +307,10 @@ impl Read for &[u8] { *self = &self[len..]; Ok(len) } + + unsafe fn is_append_only(&self) -> bool { + true + } } #[stable(feature = "rust1", since = "1.0.0")] diff --git a/library/std/src/io/mod.rs b/library/std/src/io/mod.rs index 8c71138aa231d..2084e845f9ad3 100644 --- a/library/std/src/io/mod.rs +++ b/library/std/src/io/mod.rs @@ -316,12 +316,11 @@ impl Drop for Guard<'_> { } } -// Several `read_to_string` and `read_line` methods in the standard library will -// append data into a `String` buffer, but we need to be pretty careful when -// doing this. The implementation will just call `.as_mut_vec()` and then -// delegate to a byte-oriented reading method, but we must ensure that when -// returning we never leave `buf` in a state such that it contains invalid UTF-8 -// in its bounds. +// A few methods below (`read_to_string`, `read_line`) will append data into a +// `String` buffer, but we need to be pretty careful when doing this. The +// implementation will just call `.as_mut_vec()` and then delegate to a +// byte-oriented reading method, but we must ensure that when returning we never +// leave `buf` in a state such that it contains invalid UTF-8 in its bounds. // // To this end, we use an RAII guard (to protect against panics) which updates // the length of the string when it is dropped. This guard initially truncates @@ -335,7 +334,7 @@ impl Drop for Guard<'_> { // 2. We're passing a raw buffer to the function `f`, and it is expected that // the function only *appends* bytes to the buffer. We'll get undefined // behavior if existing bytes are overwritten to have non-UTF-8 data. -pub(crate) unsafe fn append_to_string(buf: &mut String, f: F) -> Result +unsafe fn append_to_string(buf: &mut String, f: F) -> Result where F: FnOnce(&mut Vec) -> Result, { @@ -425,22 +424,6 @@ pub(crate) fn default_read_to_end(r: &mut R, buf: &mut Vec } } -pub(crate) fn default_read_to_string( - r: &mut R, - buf: &mut String, -) -> Result { - // Note that we do *not* call `r.read_to_end()` here. We are passing - // `&mut Vec` (the raw contents of `buf`) into the `read_to_end` - // method to fill it up. An arbitrary implementation could overwrite the - // entire contents of the vector, not just append to it (which is what - // we are expecting). - // - // To prevent extraneously checking the UTF-8-ness of the entire buffer - // we pass it to our hardcoded `default_read_to_end` implementation which - // we know is guaranteed to only read data into the end of the buffer. - unsafe { append_to_string(buf, |b| default_read_to_end(r, b)) } -} - pub(crate) fn default_read_vectored(read: F, bufs: &mut [IoSliceMut<'_>]) -> Result where F: FnOnce(&mut [u8]) -> Result, @@ -774,7 +757,56 @@ pub trait Read { /// [`std::fs::read_to_string`]: crate::fs::read_to_string #[stable(feature = "rust1", since = "1.0.0")] fn read_to_string(&mut self, buf: &mut String) -> Result { - default_read_to_string(self, buf) + // We want to delegate to `read_to_end` by passing it a `&mut Vec` + // (the raw contents of `buf`) and having it append bytes directly onto + // `buf`. `append_to_string` only needs to check the UTF-8-ness of the + // new data, not the entire buffer. + // + // This `Read`er might have an optimized `read_to_end` implementation, + // but it's only safe if we can be sure it's append-only. Otherwise, + // we'll fall back to our default implementation which we know is safe. + unsafe { + append_to_string(buf, |b| { + if self.is_append_only() { + self.read_to_end(b) + } else { + default_read_to_end(self, b) + } + }) + } + } + + /// Determines if this `Read`er will only append data in [`read_to_end`] and + /// [`read_to_string`] without reading or overwriting the buffer's existing + /// contents. + /// + /// `Read`ers are not supposed to read or overwrite existing data and callers + /// may want to perform unsafe optimizations that take advantage of this + /// restrictions. Without this method they would not be allowed to do so + /// since `read_to_end` and `read_to_string` are safe methods and there is no + /// guarantee that implementors are adhering to the contract. + /// + /// Overriding this method is unnecessary when a `Read`er uses the default + /// implementations of `read_to_end` and `read_to_string`, or if it delegates + /// to another `Read`er that uses the defaults. + /// + /// The behavior of this method must be independent of the state of the + /// `Read`er - the method only takes `&self` so that it can be used through + /// trait objects. + /// + /// # Safety + /// + /// This method is unsafe because callers may rely on the result to perform + /// unsafe optimizations. + /// + /// A `Read`er that delegates some or all of its reading to another `Read` + /// type must verify that the inner `Read`er is also append-only. + /// + /// [`read_to_end`]: Read::read_to_end + /// [`read_to_string`]: Read::read_to_string + #[unstable(feature = "append_only", issue = "none")] + unsafe fn is_append_only(&self) -> bool { + false } /// Read the exact number of bytes required to fill `buf`.