Skip to content

Refactor and fix parse_prefix on Windows #78833

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions library/std/src/ffi/os_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -667,10 +667,10 @@ impl OsStr {

/// Gets the underlying byte representation.
///
/// Note: it is *crucial* that this API is private, to avoid
/// Note: it is *crucial* that this API is not externally public, to avoid
/// revealing the internal, platform-specific encodings.
#[inline]
fn bytes(&self) -> &[u8] {
pub(crate) fn bytes(&self) -> &[u8] {
unsafe { &*(&self.inner as *const _ as *const [u8]) }
}

Expand Down
8 changes: 4 additions & 4 deletions library/std/src/path/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -873,12 +873,12 @@ pub fn test_decompositions_windows() {
);

t!("\\\\.\\foo/bar",
iter: ["\\\\.\\foo/bar", "\\"],
iter: ["\\\\.\\foo", "\\", "bar"],
has_root: true,
is_absolute: true,
parent: None,
file_name: None,
file_stem: None,
parent: Some("\\\\.\\foo/"),
file_name: Some("bar"),
file_stem: Some("bar"),
extension: None
);

Expand Down
174 changes: 104 additions & 70 deletions library/std/src/sys/windows/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,12 @@ mod tests;
pub const MAIN_SEP_STR: &str = "\\";
pub const MAIN_SEP: char = '\\';

// The unsafety here stems from converting between `&OsStr` and `&[u8]`
// and back. This is safe to do because (1) we only look at ASCII
// contents of the encoding and (2) new &OsStr values are produced
// only from ASCII-bounded slices of existing &OsStr values.
fn os_str_as_u8_slice(s: &OsStr) -> &[u8] {
unsafe { mem::transmute(s) }
}
unsafe fn u8_slice_as_os_str(s: &[u8]) -> &OsStr {
mem::transmute(s)
// Reinterprets `bytes` as an `OsStr` in place; nothing is copied or validated here.
//
// Safety: `bytes` must be a valid wtf8 encoded slice
#[inline]
unsafe fn bytes_as_os_str(bytes: &[u8]) -> &OsStr {
// &OsStr is layout compatible with &Slice, which is compatible with &Wtf8,
// which is compatible with &[u8].
// The transmute relies on exactly that chain of layout compatibility.
mem::transmute(bytes)
}

#[inline]
Expand All @@ -29,79 +26,116 @@ pub fn is_verbatim_sep(b: u8) -> bool {
b == b'\\'
}

// In most DOS systems, it is not possible to have more than 26 drive letters.
// See <https://en.wikipedia.org/wiki/Drive_letter_assignment#Common_assignments>.
pub fn is_valid_drive_letter(disk: u8) -> bool {
disk.is_ascii_alphabetic()
}

pub fn parse_prefix(path: &OsStr) -> Option<Prefix<'_>> {
use Prefix::{DeviceNS, Disk, Verbatim, VerbatimDisk, VerbatimUNC, UNC};

let path = os_str_as_u8_slice(path);

// \\
if let Some(path) = path.strip_prefix(br"\\") {
// \\?\
if let Some(path) = path.strip_prefix(br"?\") {
// \\?\UNC\server\share
if let Some(path) = path.strip_prefix(br"UNC\") {
let (server, share) = match get_first_two_components(path, is_verbatim_sep) {
Some((server, share)) => unsafe {
(u8_slice_as_os_str(server), u8_slice_as_os_str(share))
},
None => (unsafe { u8_slice_as_os_str(path) }, OsStr::new("")),
};
return Some(VerbatimUNC(server, share));
if let Some(path) = strip_prefix(path, r"\\") {
// \\
if let Some(path) = strip_prefix(path, r"?\") {
// \\?\
if let Some(path) = strip_prefix(path, r"UNC\") {
// \\?\UNC\server\share

let (server, path) = parse_next_component(path, true);
let (share, _) = parse_next_component(path, true);

Some(VerbatimUNC(server, share))
} else {
// \\?\path
match path {
// \\?\C:\path
[c, b':', b'\\', ..] if is_valid_drive_letter(*c) => {
return Some(VerbatimDisk(c.to_ascii_uppercase()));
}
// \\?\cat_pics
_ => {
let idx = path.iter().position(|&b| b == b'\\').unwrap_or(path.len());
let slice = &path[..idx];
return Some(Verbatim(unsafe { u8_slice_as_os_str(slice) }));
}
let (prefix, _) = parse_next_component(path, true);

// in verbatim paths only recognize an exact drive prefix
if let Some(drive) = parse_drive_exact(prefix) {
// \\?\C:
Some(VerbatimDisk(drive))
} else {
// \\?\prefix
Some(Verbatim(prefix))
}
}
} else if let Some(path) = path.strip_prefix(b".\\") {
} else if let Some(path) = strip_prefix(path, r".\") {
// \\.\COM42
let idx = path.iter().position(|&b| b == b'\\').unwrap_or(path.len());
let slice = &path[..idx];
return Some(DeviceNS(unsafe { u8_slice_as_os_str(slice) }));
}
match get_first_two_components(path, is_sep_byte) {
Some((server, share)) if !server.is_empty() && !share.is_empty() => {
let (prefix, _) = parse_next_component(path, false);
Some(DeviceNS(prefix))
} else {
let (server, path) = parse_next_component(path, false);
let (share, _) = parse_next_component(path, false);

if !server.is_empty() && !share.is_empty() {
// \\server\share
return Some(unsafe { UNC(u8_slice_as_os_str(server), u8_slice_as_os_str(share)) });
Some(UNC(server, share))
} else {
// no valid prefix beginning with "\\" recognized
None
}
_ => {}
}
} else if let [c, b':', ..] = path {
} else if let Some(drive) = parse_drive(path) {
// C:
if is_valid_drive_letter(*c) {
return Some(Disk(c.to_ascii_uppercase()));
}
Some(Disk(drive))
} else {
// no prefix
None
}
None
}

/// Returns the first two path components with predicate `f`.
///
/// The two components returned will be used by the caller
/// to construct `VerbatimUNC` or `UNC` Windows path prefix.
///
/// Returns [`None`] if there are no separators in path.
fn get_first_two_components(path: &[u8], f: fn(u8) -> bool) -> Option<(&[u8], &[u8])> {
let idx = path.iter().position(|&x| f(x))?;
// Panic safe
// The max `idx+1` is `path.len()` and `path[path.len()..]` is a valid index.
let (first, path) = (&path[..idx], &path[idx + 1..]);
let idx = path.iter().position(|&x| f(x)).unwrap_or(path.len());
let second = &path[..idx];
Some((first, second))
// Recognizes a leading drive specifier such as "C:"; anything may follow it
// (e.g. "C:\whatever"). Yields the drive letter converted to ASCII upper case,
// or `None` when the input does not start with `<letter>:`.
fn parse_drive(prefix: &OsStr) -> Option<u8> {
    let raw: &[u8] = prefix.bytes();

    // The second byte has to be the drive separator.
    if raw.len() < 2 || raw[1] != b':' {
        return None;
    }

    // In most DOS systems, it is not possible to have more than 26 drive letters.
    // See <https://en.wikipedia.org/wiki/Drive_letter_assignment#Common_assignments>.
    let letter = raw[0];
    if letter.is_ascii_alphabetic() { Some(letter.to_ascii_uppercase()) } else { None }
}

// Like `parse_drive`, but only succeeds when the whole input is the drive
// specifier itself, e.g. "C:" and nothing more.
fn parse_drive_exact(prefix: &OsStr) -> Option<u8> {
    // A bare drive prefix is exactly two bytes: the drive letter and the drive separator.
    if prefix.len() != 2 {
        return None;
    }

    parse_drive(prefix)
}

// Removes `prefix` from the front of `path`, returning the remainder, or `None`
// when `path` does not begin with `prefix`.
fn strip_prefix<'a>(path: &'a OsStr, prefix: &str) -> Option<&'a OsStr> {
    let remainder = path.bytes().strip_prefix(prefix.as_bytes());

    // SAFETY: `path` is valid wtf8 and `prefix` is valid utf8, so `path[prefix.len()]` is a
    // code point boundary and `path[prefix.len()..]` is itself a valid wtf8 encoded slice.
    remainder.map(|rest| unsafe { bytes_as_os_str(rest) })
}

// Splits off the next path component.
//
// Returns `(component, rest)`, where `rest` is what follows the component and its separator.
// When `verbatim` is true, `/` is not recognized as a separator character.
// If no separator is found, the whole input is the component and `rest` is empty.
fn parse_next_component(path: &OsStr, verbatim: bool) -> (&OsStr, &OsStr) {
    let is_separator = if verbatim { is_verbatim_sep } else { is_sep_byte };
    let bytes: &[u8] = path.bytes();

    // The component runs up to (but not including) the first separator byte.
    let component_len = match bytes.iter().position(|&b| is_separator(b)) {
        Some(index) => index,
        None => return (path, OsStr::new("")),
    };

    // A series of multiple separator characters is treated as a single separator,
    // except in verbatim paths, where exactly one separator byte is consumed.
    let separator_len = if verbatim {
        1
    } else {
        // `bytes[component_len]` is a separator, so this count is always at least 1.
        bytes[component_len..].iter().take_while(|&&b| is_separator(b)).count()
    };

    // Panic safe: `rest_start` is at most `bytes.len()`, and `bytes[bytes.len()..]` is a
    // valid (empty) slice.
    let rest_start = component_len + separator_len;
    let component = &bytes[..component_len];
    let rest = &bytes[rest_start..];

    // SAFETY: `path` is a valid wtf8 encoded slice and each of the separators ('/', '\')
    // is encoded in a single byte, therefore `bytes[component_len]` and `bytes[rest_start]`
    // must be code point boundaries and thus `bytes[..component_len]` and
    // `bytes[rest_start..]` are valid wtf8 slices.
    unsafe { (bytes_as_os_str(component), bytes_as_os_str(rest)) }
}
39 changes: 31 additions & 8 deletions library/std/src/sys/windows/path/tests.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,44 @@
use super::*;

#[test]
fn test_get_first_two_components() {
fn test_parse_next_component() {
assert_eq!(
get_first_two_components(br"server\share", is_verbatim_sep),
Some((&b"server"[..], &b"share"[..])),
parse_next_component(OsStr::new(r"server\share"), true),
(OsStr::new(r"server"), OsStr::new(r"share"))
);

assert_eq!(
get_first_two_components(br"server\", is_verbatim_sep),
Some((&b"server"[..], &b""[..]))
parse_next_component(OsStr::new(r"server/share"), true),
(OsStr::new(r"server/share"), OsStr::new(r""))
);

assert_eq!(
get_first_two_components(br"\server\", is_verbatim_sep),
Some((&b""[..], &b"server"[..]))
parse_next_component(OsStr::new(r"server/share"), false),
(OsStr::new(r"server"), OsStr::new(r"share"))
);

assert_eq!(get_first_two_components(br"there are no separators here", is_verbatim_sep), None,);
assert_eq!(
parse_next_component(OsStr::new(r"server\"), false),
(OsStr::new(r"server"), OsStr::new(r""))
);

assert_eq!(
parse_next_component(OsStr::new(r"\server\"), false),
(OsStr::new(r""), OsStr::new(r"server\"))
);

assert_eq!(
parse_next_component(OsStr::new(r"servershare"), false),
(OsStr::new(r"servershare"), OsStr::new(""))
);

assert_eq!(
parse_next_component(OsStr::new(r"server/\//\/\\\\/////\/share"), false),
(OsStr::new(r"server"), OsStr::new(r"share"))
);

assert_eq!(
parse_next_component(OsStr::new(r"server\\\\\\\\\\\\\\share"), true),
(OsStr::new(r"server"), OsStr::new(r"\\\\\\\\\\\\\share"))
);
}