From 3468a0ff9ca41a527b03bf4555a452c0c1d06d93 Mon Sep 17 00:00:00 2001 From: Chris Denton Date: Wed, 3 Nov 2021 18:46:41 +0000 Subject: [PATCH 1/3] Make `path::Display` prefer user paths over verbatim Verbatim paths are essentially an API hack to allow smuggling NT kernel paths through the Win32 API. This allows bypassing certain historic limits of the Win32 APIs. Most users do not normally encounter them and can find it confusing if they do show up. So `path::Display` will, in the common case, display user-style paths instead of verbatim. --- library/std/src/path.rs | 26 +++++++++++++++++++++++++- library/std/src/path/tests.rs | 14 ++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/library/std/src/path.rs b/library/std/src/path.rs index dc0c735a06c6f..8900a4d4b41d9 100644 --- a/library/std/src/path.rs +++ b/library/std/src/path.rs @@ -2841,6 +2841,11 @@ impl fmt::Debug for Path { /// println!("{}", path.display()); /// ``` /// +/// # Windows +/// +/// Verbatim paths may be converted to their more familiar form. +/// For example, `\\?\C:\Program Files\Rust` may display as `C:\Program Files\Rust`. +/// /// [`Display`]: fmt::Display /// [`format!`]: crate::format #[stable(feature = "rust1", since = "1.0.0")] @@ -2858,7 +2863,26 @@ impl fmt::Debug for Display<'_> { #[stable(feature = "rust1", since = "1.0.0")] impl fmt::Display for Display<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - self.path.inner.display(f) + let path = if cfg!(windows) && self.path.as_u8_slice().starts_with(br"\\?\") { + // Convert Windows drive verbatim and UNC verbatim paths to their + // user path equivalents. + // SAFETY: Paths are only ever split on ASCII boundaries. + match self.path.as_u8_slice()[4..] { + // \\?\C:\, \\?\D:\, etc to C:\, D:\, etc + ref path @ [drive @ _, b':', b'\\', ..] if drive.is_ascii_alphabetic() => unsafe { + Path::from_u8_slice(path) + }, + // \\?\UNC\ to \\ + [b'U', b'N', b'C', b'\\', ref path @ ..] 
=> { + f.write_str(r"\\")?; + unsafe { Path::from_u8_slice(path) } + } + _ => self.path, + } + } else { + self.path + }; + path.inner.display(f) } } diff --git a/library/std/src/path/tests.rs b/library/std/src/path/tests.rs index 0a16ff2a721ce..27747b0691571 100644 --- a/library/std/src/path/tests.rs +++ b/library/std/src/path/tests.rs @@ -1090,6 +1090,20 @@ pub fn test_decompositions_windows() { file_prefix: Some(".x") ); } +#[test] +#[cfg(windows)] +pub fn windows_display_user_paths() { + fn check(path: &str, expected: &str) { + assert_eq!(&Path::new(path).display().to_string(), expected); + } + check(r"\\?\UNC\server\share", r"\\server\share"); + check(r"\\?\C:\path", r"C:\path"); + check(r"\\?\C:\", r"C:\"); + + // This should not change. + // `\\?\C:` is an absolute path while `C:` is a "drive relative" path. + check(r"\\?\C:", r"\\?\C:"); +} #[test] pub fn test_stem_ext() { From eabdc033f1d84405c9fdc967f3578584250efadc Mon Sep 17 00:00:00 2001 From: Chris Denton Date: Fri, 5 Nov 2021 19:49:56 +0000 Subject: [PATCH 2/3] `try_from_verbatim` --- library/std/src/path.rs | 25 ++--- library/std/src/path/tests.rs | 136 +++++++++++++++++++++++++++- library/std/src/sys/windows/path.rs | 105 +++++++++++++++++++++ 3 files changed, 248 insertions(+), 18 deletions(-) diff --git a/library/std/src/path.rs b/library/std/src/path.rs index 8900a4d4b41d9..ce1901fb4f7c6 100644 --- a/library/std/src/path.rs +++ b/library/std/src/path.rs @@ -2863,22 +2863,17 @@ impl fmt::Debug for Display<'_> { #[stable(feature = "rust1", since = "1.0.0")] impl fmt::Display for Display<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let path = if cfg!(windows) && self.path.as_u8_slice().starts_with(br"\\?\") { - // Convert Windows drive verbatim and UNC verbatim paths to their - // user path equivalents. - // SAFETY: Paths are only ever split on ASCII boundaries. - match self.path.as_u8_slice()[4..] 
{ - // \\?\C:\, \\?\D:\, etc to C:\, D:\, etc - ref path @ [drive @ _, b':', b'\\', ..] if drive.is_ascii_alphabetic() => unsafe { - Path::from_u8_slice(path) - }, - // \\?\UNC\ to \\ - [b'U', b'N', b'C', b'\\', ref path @ ..] => { - f.write_str(r"\\")?; - unsafe { Path::from_u8_slice(path) } - } - _ => self.path, + #[cfg(not(windows))] + let path = self.path; + #[cfg(windows)] + let path = if let Some((root, subpath)) = + crate::sys::path::try_from_verbatim(self.path.as_u8_slice()) + { + if root.is_unc() { + f.write_str(r"\\")?; } + // SAFETY: The path will only be split after an ASCII root. + unsafe { Path::from_u8_slice(subpath) } } else { self.path }; diff --git a/library/std/src/path/tests.rs b/library/std/src/path/tests.rs index 27747b0691571..ec26f84a878ad 100644 --- a/library/std/src/path/tests.rs +++ b/library/std/src/path/tests.rs @@ -1096,13 +1096,143 @@ pub fn windows_display_user_paths() { fn check(path: &str, expected: &str) { assert_eq!(&Path::new(path).display().to_string(), expected); } - check(r"\\?\UNC\server\share", r"\\server\share"); + fn unchanged(path: &str) { + check(path, path); + } + + // Make sure non-verbatim paths aren't changed + unchanged(r"path\to\file"); + unchanged(r".\path\to\file"); + unchanged(r"..\path\to\file"); + unchanged(r"C:\path\to\file"); + unchanged(r"\\server\share\path\to\file"); + unchanged(r"\\.\server\share\path\to\file"); + unchanged(r"//?\UNC\server\share\path\to\file"); + + // The simple cases. check(r"\\?\C:\path", r"C:\path"); check(r"\\?\C:\", r"C:\"); + check(r"\\?\UNC\server\share", r"\\server\share"); + check(r"\\?\UNC\server\share\", r"\\server\share\"); + check(r"\\?\UNC\server\share\path", r"\\server\share\path"); - // This should not change. // `\\?\C:` is an absolute path while `C:` is a "drive relative" path. - check(r"\\?\C:", r"\\?\C:"); + unchanged(r"\\?\C:"); + + // We only change drive and UNC paths, not device paths. 
+ unchanged(r"\\?\pipe\name"); + + // Empty components are nonsensical but can be represented in verbatim paths + unchanged(r"\\?\C:\path\\to\file"); + unchanged(r"\\?\UNC\server\share\path\\to\file"); + + // Verbatim `.` and `..` components have no user path equivalent. + unchanged(r"\\?\C:\path\..\file"); + unchanged(r"\\?\C:\path\.\file"); + unchanged(r"\\?\C:\path\file\."); + unchanged(r"\\?\C:\path\file\.."); + + unchanged(r"\\?\UNC\server\share\path\..\file"); + unchanged(r"\\?\UNC\server\share\path\.\file"); + unchanged(r"\\?\UNC\server\share\path\file\."); + unchanged(r"\\?\UNC\server\share\path\file\.."); + + // All trailing dots and spaces are stripped from user paths. + unchanged(r"\\?\C:\path\to\file.."); + unchanged(r"\\?\C:\path\to\file "); + unchanged(r"\\?\C:\path\to\file.. .."); + + unchanged(r"\\?\UNC\server\share\path\to\file.."); + unchanged(r"\\?\UNC\server\share\path\to\file "); + unchanged(r"\\?\UNC\server\share\path\to\file.. .."); + + // A single trailing dot in an interior component will be stripped by + // non-verbatim paths... + unchanged(r"\\?\C:\path\to.\file"); + unchanged(r"\\?\UNC\server\share\path\to.\file"); + + // ...but two dots won't be. Don't ask. + check(r"\\?\C:\path\to..\file", r"C:\path\to..\file"); + check(r"\\?\C:\path\to..\file", r"C:\path\to..\file"); + + check(r"\\?\UNC\server\share\path\to..\file", r"\\server\share\path\to..\file"); + check(r"\\?\UNC\server\share\path\to..\file", r"\\server\share\path\to..\file"); + + // Dots elsewhere are fine. 
+ check(r"\\?\C:\path\.to\file", r"C:\path\.to\file"); + check(r"\\?\C:\path\..to\file", r"C:\path\..to\file"); + check(r"\\?\C:\path\t.o\file", r"C:\path\t.o\file"); + check(r"\\?\C:\path\t..o\file", r"C:\path\t..o\file"); + + check(r"\\?\UNC\server\share\path\.to\file", r"\\server\share\path\.to\file"); + check(r"\\?\UNC\server\share\path\..to\file", r"\\server\share\path\..to\file"); + check(r"\\?\UNC\server\share\path\t.o\file", r"\\server\share\path\t.o\file"); + check(r"\\?\UNC\server\share\path\t..o\file", r"\\server\share\path\t..o\file"); + + // Verbatim `/` has no user path equivalent. + unchanged(r"\\?\C:\path/to\file"); + unchanged(r"\\?\UNC\server\share\path/to\file"); + + // Legacy DOS device names are converted to `\\.\` paths, but only for drive paths. + unchanged(r"\\?\C:\path\to\AUX"); + unchanged(r"\\?\C:\path\to\NUL"); + unchanged(r"\\?\C:\path\to\PRN"); + unchanged(r"\\?\C:\path\to\CON"); + unchanged(r"\\?\C:\path\to\CONIN$"); + unchanged(r"\\?\C:\path\to\CONOUT$"); + + unchanged(r"\\?\C:\path\to\COM1"); + unchanged(r"\\?\C:\path\to\COM2"); + unchanged(r"\\?\C:\path\to\COM3"); + unchanged(r"\\?\C:\path\to\COM4"); + unchanged(r"\\?\C:\path\to\COM5"); + unchanged(r"\\?\C:\path\to\COM6"); + unchanged(r"\\?\C:\path\to\COM7"); + unchanged(r"\\?\C:\path\to\COM8"); + unchanged(r"\\?\C:\path\to\COM9"); + + unchanged(r"\\?\C:\path\to\LPT1"); + unchanged(r"\\?\C:\path\to\LPT2"); + unchanged(r"\\?\C:\path\to\LPT3"); + unchanged(r"\\?\C:\path\to\LPT4"); + unchanged(r"\\?\C:\path\to\LPT5"); + unchanged(r"\\?\C:\path\to\LPT6"); + unchanged(r"\\?\C:\path\to\LPT7"); + unchanged(r"\\?\C:\path\to\LPT8"); + unchanged(r"\\?\C:\path\to\LPT9"); + + // Yes, these are superscript digits. The legend goes that someone once used + // the wrong "is a digit" function and now it can't be changed due to + // stability guarantees. 
+ unchanged(r"\\?\C:\path\to\COM²"); + unchanged(r"\\?\C:\path\to\COM³"); + unchanged(r"\\?\C:\path\to\COM¹"); + unchanged(r"\\?\C:\path\to\LPT²"); + unchanged(r"\\?\C:\path\to\LPT³"); + unchanged(r"\\?\C:\path\to\LPT¹"); + + // DOS device names are case-insensitive + unchanged(r"\\?\C:\aux"); + unchanged(r"\\?\C:\CoM4"); + unchanged(r"\\?\C:\cOnOuT$"); + + // Everything after a dot is ignored for the sake of parsing the device name. + unchanged(r"\\?\C:\path\to\LPT¹.txt"); + // Spaces are ignored too. + unchanged(r"\\?\C:\path\to\LPT¹ "); + // And these two rules can be combined. + unchanged(r"\\?\C:\path\to\LPT¹ .txt"); + + // UNC paths can have DOS devices even when not verbatim. + check(r"\\?\UNC\server\share\AUX", r"\\server\share\AUX"); + + // In UNC paths, the server name and share name are never changed, even + // without the verbatim prefix... + check(r"\\?\UNC\..\share.\", r"\\..\share.\"); + + // ... aside from the forward slash which is a path separator. + unchanged(r"\\?\UNC\ser/ver\share\"); + unchanged(r"\\?\UNC\server\sh/are\"); } #[test] diff --git a/library/std/src/sys/windows/path.rs b/library/std/src/sys/windows/path.rs index 460c1eff7788d..88b50c3006296 100644 --- a/library/std/src/sys/windows/path.rs +++ b/library/std/src/sys/windows/path.rs @@ -242,3 +242,108 @@ pub(crate) fn maybe_verbatim(path: &Path) -> io::Result> { )?; Ok(path) } + +#[derive(PartialEq, Eq, Clone, Copy)] +pub(crate) enum Root { + Drive, + Unc, + //Device, +} +impl Root { + pub fn is_unc(self) -> bool { + self == Root::Unc + } +} + +/// If the given verbatim path can be losslessly converted to a user path, +/// then this returns the [`Root`] type and the subpath. +/// Otherwise it returns `None`. +pub(crate) fn try_from_verbatim(path: &[u8]) -> Option<(Root, &[u8])> { + if !path.starts_with(br"\\?\") { + return None; + } + // Parse the root type. + let (root, subpath) = match path[4..] { + ref subpath @ [drive @ _, b':', b'\\', ..] 
if drive.is_ascii_alphabetic() => { + (Root::Drive, subpath) + } + [b'U', b'N', b'C', b'\\', ref subpath @ ..] => (Root::Unc, subpath), + _ => return None, + }; + + let mut components = subpath.split_inclusive(|&b| b == b'\\'); + let mut filename = None; + if root == Root::Unc { + // Skip the first two components. + for component in components.by_ref().take(2) { + if component.contains(&b'/') { + return None; + } + } + } + for component in components { + if component.contains(&b'/') { + return None; + } + match component { + br"\" | br".\" | br"..\" => return None, + // Ends with one and only one dot. + [.., b @ _, b'.', b'\\'] if *b != b'.' => return None, + _ => {} + } + filename.replace(component); + } + if let Some(name) = filename { + if matches!(name.last(), Some(b'.') | Some(b' ')) + || (root == Root::Drive && is_dos_device(name)) + { + return None; + } + } + Some((root, subpath)) +} + +/// Returns true if the filename is the name of a DOS device. +fn is_dos_device(filename: &[u8]) -> bool { + // The UTF-8 encoding of "²", "³" and "¹" is two bytes and starts with 0xc2. + const SUPER: u8 = 0xc2; + const SUPER_2: u8 = 0xb2; // ² + const SUPER_3: u8 = 0xb3; // ³ + const SUPER_1: u8 = 0xb9; // ¹ + + let upper = { + let mut upper = [0u8; 7]; + for (a, &b) in upper.iter_mut().zip(filename.iter()) { + *a = b.to_ascii_uppercase(); + } + upper + }; + let tail = match &upper[..3] { + b"AUX" | b"NUL" | b"PRN" => &filename[3..], + b"CON" => match &upper[3..] { + // Disambiguate `CON`, `CONIN$` and `CONOUT$`. + b"OUT$" => &filename[7..], + [b'I', b'N', b'$', _] => &filename[6..], + _ => &filename[3..], + }, + b"COM" | b"LPT" => match upper[3] { + // Match digit + b'1'..=b'9' => &filename[4..], + // Test for the two byte super-script numbers. + SUPER if matches!(upper[4], SUPER_1 | SUPER_2 | SUPER_3) => &filename[5..], + _ => return false, + }, + _ => return false, + }; + // Trailing spaces are ignored. + // A `.` marks the end of the device name. 
+ // Anything else means this is not a device name. + let mut iter = tail.iter(); + loop { + match iter.next() { + None | Some(b'.') => break true, + Some(b' ') => continue, + _ => break false, + } + } +} From 26e8a0a66d2496e2bd828ac5f8d54be833ab55c4 Mon Sep 17 00:00:00 2001 From: Chris Denton Date: Fri, 5 Nov 2021 20:21:56 +0000 Subject: [PATCH 3/3] Remove duplicate test --- library/std/src/path/tests.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/library/std/src/path/tests.rs b/library/std/src/path/tests.rs index ec26f84a878ad..288f02a58a326 100644 --- a/library/std/src/path/tests.rs +++ b/library/std/src/path/tests.rs @@ -1153,9 +1153,6 @@ pub fn windows_display_user_paths() { // ...but two dots won't be. Don't ask. check(r"\\?\C:\path\to..\file", r"C:\path\to..\file"); - check(r"\\?\C:\path\to..\file", r"C:\path\to..\file"); - - check(r"\\?\UNC\server\share\path\to..\file", r"\\server\share\path\to..\file"); check(r"\\?\UNC\server\share\path\to..\file", r"\\server\share\path\to..\file"); // Dots elsewhere are fine.