Skip to content

Commit b78b268

Browse files
authored
Merge pull request #19655 from squeek502/windows-argv-post-2008
ArgIteratorWindows: Match post-2008 C runtime rather than `CommandLineToArgvW`
2 parents ff18103 + cffe199 commit b78b268

File tree

8 files changed

+502
-84
lines changed

8 files changed

+502
-84
lines changed

lib/std/process.zig

Lines changed: 147 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -625,11 +625,22 @@ pub const ArgIteratorWasi = struct {
625625
};
626626

627627
/// Iterator that implements the Windows command-line parsing algorithm.
628+
/// The implementation is intended to be compatible with the post-2008 C runtime,
629+
/// but is *not* intended to be compatible with `CommandLineToArgvW` since
630+
/// `CommandLineToArgvW` uses the pre-2008 parsing rules.
628631
///
629-
/// This iterator faithfully implements the parsing behavior observed in `CommandLineToArgvW` with
632+
/// This iterator faithfully implements the parsing behavior observed from the C runtime with
630633
/// one exception: if the command-line string is empty, the iterator will immediately complete
631-
/// without returning any arguments (whereas `CommandLineArgvW` will return a single argument
634+
/// without returning any arguments (whereas the C runtime will return a single argument
632635
/// representing the name of the current executable).
636+
///
637+
/// The essential parts of the algorithm are described in Microsoft's documentation:
638+
///
639+
/// - https://learn.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-170#parsing-c-command-line-arguments
640+
///
641+
/// David Deley explains some additional undocumented quirks in great detail:
642+
///
643+
/// - https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULES
633644
pub const ArgIteratorWindows = struct {
634645
allocator: Allocator,
635646
/// Owned by the iterator.
@@ -686,6 +697,51 @@ pub const ArgIteratorWindows = struct {
686697
fn emitCharacter(self: *ArgIteratorWindows, char: u8) void {
687698
self.buffer[self.end] = char;
688699
self.end += 1;
700+
701+
// Because we are emitting WTF-8 byte-by-byte, we need to
702+
// check to see if we've emitted two consecutive surrogate
703+
// codepoints that form a valid surrogate pair in order
704+
// to ensure that we're always emitting well-formed WTF-8
705+
// (https://simonsapin.github.io/wtf-8/#concatenating).
706+
//
707+
// If we do have a valid surrogate pair, we need to emit
708+
// the UTF-8 sequence for the codepoint that they encode
709+
// instead of the WTF-8 encoding for the two surrogate codepoints
710+
// separately.
711+
//
712+
// This is relevant when dealing with a WTF-16 encoded
713+
// command line like this:
714+
// "<0xD801>"<0xDC37>
715+
// which would get converted to WTF-8 in `cmd_line` as:
716+
// "<0xED><0xA0><0x81>"<0xED><0xB0><0xB7>
717+
// and then after parsing it'd naively get emitted as:
718+
// <0xED><0xA0><0x81><0xED><0xB0><0xB7>
719+
// but instead, we need to recognize the surrogate pair
720+
// and emit the codepoint it encodes, which in this
721+
// example is U+10437 (𐐷), which is encoded in UTF-8 as:
722+
// <0xF0><0x90><0x90><0xB7>
723+
concatSurrogatePair(self);
724+
}
725+
726+
fn concatSurrogatePair(self: *ArgIteratorWindows) void {
727+
// Surrogate codepoints are always encoded as 3 bytes, so there
728+
// must be 6 bytes for a surrogate pair to exist.
729+
if (self.end - self.start >= 6) {
730+
const window = self.buffer[self.end - 6 .. self.end];
731+
const view = std.unicode.Wtf8View.init(window) catch return;
732+
var it = view.iterator();
733+
var pair: [2]u16 = undefined;
734+
pair[0] = std.mem.nativeToLittle(u16, std.math.cast(u16, it.nextCodepoint().?) orelse return);
735+
if (!std.unicode.utf16IsHighSurrogate(std.mem.littleToNative(u16, pair[0]))) return;
736+
pair[1] = std.mem.nativeToLittle(u16, std.math.cast(u16, it.nextCodepoint().?) orelse return);
737+
if (!std.unicode.utf16IsLowSurrogate(std.mem.littleToNative(u16, pair[1]))) return;
738+
// We know we have a valid surrogate pair, so convert
739+
// it to UTF-8, overwriting the surrogate pair's bytes
740+
// and then chop off the extra bytes.
741+
const len = std.unicode.utf16LeToUtf8(window, &pair) catch unreachable;
742+
const delta = 6 - len;
743+
self.end -= delta;
744+
}
689745
}
690746

691747
fn yieldArg(self: *ArgIteratorWindows) [:0]const u8 {
@@ -711,69 +767,37 @@ pub const ArgIteratorWindows = struct {
711767
}
712768
};
713769

714-
// The essential parts of the algorithm are described in Microsoft's documentation:
715-
//
716-
// - <https://learn.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-170#parsing-c-command-line-arguments>
717-
// - <https://learn.microsoft.com/en-us/windows/win32/api/shellapi/nf-shellapi-commandlinetoargvw>
718-
//
719-
// David Deley explains some additional undocumented quirks in great detail:
720-
//
721-
// - <https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULES>
722-
//
723-
// Code points <= U+0020 terminating an unquoted first argument was discovered independently by
724-
// testing and observing the behavior of 'CommandLineToArgvW' on Windows 10.
725-
726770
fn nextWithStrategy(self: *ArgIteratorWindows, comptime strategy: type) strategy.T {
727771
// The first argument (the executable name) uses different parsing rules.
728772
if (self.index == 0) {
729-
var char = if (self.cmd_line.len != 0) self.cmd_line[0] else 0;
730-
switch (char) {
731-
0 => {
732-
// Immediately complete the iterator.
733-
// 'CommandLineToArgvW' would return the name of the current executable here.
734-
return strategy.eof;
735-
},
736-
'"' => {
737-
// If the first character is a quote, read everything until the next quote (then
738-
// skip that quote), or until the end of the string.
739-
self.index += 1;
740-
while (true) : (self.index += 1) {
741-
char = if (self.index != self.cmd_line.len) self.cmd_line[self.index] else 0;
742-
switch (char) {
743-
0 => {
744-
return strategy.yieldArg(self);
745-
},
746-
'"' => {
747-
self.index += 1;
748-
return strategy.yieldArg(self);
749-
},
750-
else => {
751-
strategy.emitCharacter(self, char);
752-
},
753-
}
754-
}
755-
},
756-
else => {
757-
// Otherwise, read everything until the next space or ASCII control character
758-
// (not including DEL) (then skip that character), or until the end of the
759-
// string. This means that if the command-line string starts with one of these
760-
// characters, the first returned argument will be the empty string.
761-
while (true) : (self.index += 1) {
762-
char = if (self.index != self.cmd_line.len) self.cmd_line[self.index] else 0;
763-
switch (char) {
764-
0 => {
765-
return strategy.yieldArg(self);
766-
},
767-
'\x01'...' ' => {
768-
self.index += 1;
769-
return strategy.yieldArg(self);
770-
},
771-
else => {
772-
strategy.emitCharacter(self, char);
773-
},
773+
if (self.cmd_line.len == 0 or self.cmd_line[0] == 0) {
774+
// Immediately complete the iterator.
775+
// The C runtime would return the name of the current executable here.
776+
return strategy.eof;
777+
}
778+
779+
var inside_quotes = false;
780+
while (true) : (self.index += 1) {
781+
const char = if (self.index != self.cmd_line.len) self.cmd_line[self.index] else 0;
782+
switch (char) {
783+
0 => {
784+
return strategy.yieldArg(self);
785+
},
786+
'"' => {
787+
inside_quotes = !inside_quotes;
788+
},
789+
' ', '\t' => {
790+
if (inside_quotes)
791+
strategy.emitCharacter(self, char)
792+
else {
793+
self.index += 1;
794+
return strategy.yieldArg(self);
774795
}
775-
}
776-
},
796+
},
797+
else => {
798+
strategy.emitCharacter(self, char);
799+
},
800+
}
777801
}
778802
}
779803

@@ -791,9 +815,10 @@ pub const ArgIteratorWindows = struct {
791815
//
792816
// - The end of the string always terminates the current argument.
793817
// - When not in 'inside_quotes' mode, a space or tab terminates the current argument.
794-
// - 2n backslashes followed by a quote emit n backslashes. If in 'inside_quotes' and the
795-
// quote is immediately followed by a second quote, one quote is emitted and the other is
796-
// skipped, otherwise, the quote is skipped. Finally, 'inside_quotes' is toggled.
818+
// - 2n backslashes followed by a quote emit n backslashes (note: n can be zero).
819+
// If in 'inside_quotes' and the quote is immediately followed by a second quote,
820+
// one quote is emitted and the other is skipped, otherwise, the quote is skipped
821+
// and 'inside_quotes' is toggled.
797822
// - 2n + 1 backslashes followed by a quote emit n backslashes followed by a quote.
798823
// - n backslashes not followed by a quote emit n backslashes.
799824
var backslash_count: usize = 0;
@@ -826,8 +851,9 @@ pub const ArgIteratorWindows = struct {
826851
{
827852
strategy.emitCharacter(self, '"');
828853
self.index += 1;
854+
} else {
855+
inside_quotes = !inside_quotes;
829856
}
830-
inside_quotes = !inside_quotes;
831857
}
832858
},
833859
'\\' => {
@@ -1215,10 +1241,10 @@ test ArgIteratorWindows {
12151241
// Separators
12161242
try t("aa bb cc", &.{ "aa", "bb", "cc" });
12171243
try t("aa\tbb\tcc", &.{ "aa", "bb", "cc" });
1218-
try t("aa\nbb\ncc", &.{ "aa", "bb\ncc" });
1219-
try t("aa\r\nbb\r\ncc", &.{ "aa", "\nbb\r\ncc" });
1220-
try t("aa\rbb\rcc", &.{ "aa", "bb\rcc" });
1221-
try t("aa\x07bb\x07cc", &.{ "aa", "bb\x07cc" });
1244+
try t("aa\nbb\ncc", &.{"aa\nbb\ncc"});
1245+
try t("aa\r\nbb\r\ncc", &.{"aa\r\nbb\r\ncc"});
1246+
try t("aa\rbb\rcc", &.{"aa\rbb\rcc"});
1247+
try t("aa\x07bb\x07cc", &.{"aa\x07bb\x07cc"});
12221248
try t("aa\x7Fbb\x7Fcc", &.{"aa\x7Fbb\x7Fcc"});
12231249
try t("aa🦎bb🦎cc", &.{"aa🦎bb🦎cc"});
12241250

@@ -1227,22 +1253,22 @@ test ArgIteratorWindows {
12271253
try t(" aa bb ", &.{ "", "aa", "bb" });
12281254
try t("\t\t", &.{""});
12291255
try t("\t\taa\t\tbb\t\t", &.{ "", "aa", "bb" });
1230-
try t("\n\n", &.{ "", "\n" });
1231-
try t("\n\naa\n\nbb\n\n", &.{ "", "\naa\n\nbb\n\n" });
1256+
try t("\n\n", &.{"\n\n"});
1257+
try t("\n\naa\n\nbb\n\n", &.{"\n\naa\n\nbb\n\n"});
12321258

12331259
// Executable name with quotes/backslashes
12341260
try t("\"aa bb\tcc\ndd\"", &.{"aa bb\tcc\ndd"});
12351261
try t("\"", &.{""});
12361262
try t("\"\"", &.{""});
1237-
try t("\"\"\"", &.{ "", "" });
1238-
try t("\"\"\"\"", &.{ "", "" });
1239-
try t("\"\"\"\"\"", &.{ "", "\"" });
1240-
try t("aa\"bb\"cc\"dd", &.{"aa\"bb\"cc\"dd"});
1241-
try t("aa\"bb cc\"dd", &.{ "aa\"bb", "ccdd" });
1242-
try t("\"aa\\\"bb\"", &.{ "aa\\", "bb" });
1263+
try t("\"\"\"", &.{""});
1264+
try t("\"\"\"\"", &.{""});
1265+
try t("\"\"\"\"\"", &.{""});
1266+
try t("aa\"bb\"cc\"dd", &.{"aabbccdd"});
1267+
try t("aa\"bb cc\"dd", &.{"aabb ccdd"});
1268+
try t("\"aa\\\"bb\"", &.{"aa\\bb"});
12431269
try t("\"aa\\\\\"", &.{"aa\\\\"});
1244-
try t("aa\\\"bb", &.{"aa\\\"bb"});
1245-
try t("aa\\\\\"bb", &.{"aa\\\\\"bb"});
1270+
try t("aa\\\"bb", &.{"aa\\bb"});
1271+
try t("aa\\\\\"bb", &.{"aa\\\\bb"});
12461272

12471273
// Arguments with quotes/backslashes
12481274
try t(". \"aa bb\tcc\ndd\"", &.{ ".", "aa bb\tcc\ndd" });
@@ -1252,29 +1278,66 @@ test ArgIteratorWindows {
12521278
try t(". \"\"", &.{ ".", "" });
12531279
try t(". \"\"\"", &.{ ".", "\"" });
12541280
try t(". \"\"\"\"", &.{ ".", "\"" });
1255-
try t(". \"\"\"\"\"", &.{ ".", "\"" });
1281+
try t(". \"\"\"\"\"", &.{ ".", "\"\"" });
12561282
try t(". \"\"\"\"\"\"", &.{ ".", "\"\"" });
12571283
try t(". \" \"", &.{ ".", " " });
12581284
try t(". \" \"\"", &.{ ".", " \"" });
12591285
try t(". \" \"\"\"", &.{ ".", " \"" });
1260-
try t(". \" \"\"\"\"", &.{ ".", " \"" });
1286+
try t(". \" \"\"\"\"", &.{ ".", " \"\"" });
12611287
try t(". \" \"\"\"\"\"", &.{ ".", " \"\"" });
1262-
try t(". \" \"\"\"\"\"\"", &.{ ".", " \"\"" });
1288+
try t(". \" \"\"\"\"\"\"", &.{ ".", " \"\"\"" });
12631289
try t(". \\\"", &.{ ".", "\"" });
12641290
try t(". \\\"\"", &.{ ".", "\"" });
12651291
try t(". \\\"\"\"", &.{ ".", "\"" });
12661292
try t(". \\\"\"\"\"", &.{ ".", "\"\"" });
12671293
try t(". \\\"\"\"\"\"", &.{ ".", "\"\"" });
1268-
try t(". \\\"\"\"\"\"\"", &.{ ".", "\"\"" });
1294+
try t(". \\\"\"\"\"\"\"", &.{ ".", "\"\"\"" });
12691295
try t(". \" \\\"", &.{ ".", " \"" });
12701296
try t(". \" \\\"\"", &.{ ".", " \"" });
12711297
try t(". \" \\\"\"\"", &.{ ".", " \"\"" });
12721298
try t(". \" \\\"\"\"\"", &.{ ".", " \"\"" });
1273-
try t(". \" \\\"\"\"\"\"", &.{ ".", " \"\"" });
1299+
try t(". \" \\\"\"\"\"\"", &.{ ".", " \"\"\"" });
12741300
try t(". \" \\\"\"\"\"\"\"", &.{ ".", " \"\"\"" });
12751301
try t(". aa\\bb\\\\cc\\\\\\dd", &.{ ".", "aa\\bb\\\\cc\\\\\\dd" });
12761302
try t(". \\\\\\\"aa bb\"", &.{ ".", "\\\"aa", "bb" });
12771303
try t(". \\\\\\\\\"aa bb\"", &.{ ".", "\\\\aa bb" });
1304+
1305+
// From https://learn.microsoft.com/en-us/cpp/cpp/main-function-command-line-args#results-of-parsing-command-lines
1306+
try t(
1307+
\\foo.exe "abc" d e
1308+
, &.{ "foo.exe", "abc", "d", "e" });
1309+
try t(
1310+
\\foo.exe a\\b d"e f"g h
1311+
, &.{ "foo.exe", "a\\\\b", "de fg", "h" });
1312+
try t(
1313+
\\foo.exe a\\\"b c d
1314+
, &.{ "foo.exe", "a\\\"b", "c", "d" });
1315+
try t(
1316+
\\foo.exe a\\\\"b c" d e
1317+
, &.{ "foo.exe", "a\\\\b c", "d", "e" });
1318+
try t(
1319+
\\foo.exe a"b"" c d
1320+
, &.{ "foo.exe", "ab\" c d" });
1321+
1322+
// From https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULESEX
1323+
try t("foo.exe CallMeIshmael", &.{ "foo.exe", "CallMeIshmael" });
1324+
try t("foo.exe \"Call Me Ishmael\"", &.{ "foo.exe", "Call Me Ishmael" });
1325+
try t("foo.exe Cal\"l Me I\"shmael", &.{ "foo.exe", "Call Me Ishmael" });
1326+
try t("foo.exe CallMe\\\"Ishmael", &.{ "foo.exe", "CallMe\"Ishmael" });
1327+
try t("foo.exe \"CallMe\\\"Ishmael\"", &.{ "foo.exe", "CallMe\"Ishmael" });
1328+
try t("foo.exe \"Call Me Ishmael\\\\\"", &.{ "foo.exe", "Call Me Ishmael\\" });
1329+
try t("foo.exe \"CallMe\\\\\\\"Ishmael\"", &.{ "foo.exe", "CallMe\\\"Ishmael" });
1330+
try t("foo.exe a\\\\\\b", &.{ "foo.exe", "a\\\\\\b" });
1331+
try t("foo.exe \"a\\\\\\b\"", &.{ "foo.exe", "a\\\\\\b" });
1332+
1333+
// Surrogate pair encoding of 𐐷 separated by quotes.
1334+
// Encoded as WTF-16:
1335+
// "<0xD801>"<0xDC37>
1336+
// Encoded as WTF-8:
1337+
// "<0xED><0xA0><0x81>"<0xED><0xB0><0xB7>
1338+
// During parsing, the quotes drop out and the surrogate pair
1339+
// should end up encoded as its normal UTF-8 representation.
1340+
try t("foo.exe \"\xed\xa0\x81\"\xed\xb0\xb7", &.{ "foo.exe", "𐐷" });
12781341
}
12791342

12801343
fn testArgIteratorWindows(cmd_line: []const u8, expected_args: []const []const u8) !void {

test/standalone/build.zig.zon

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,9 @@
104104
.windows_spawn = .{
105105
.path = "windows_spawn",
106106
},
107+
.windows_argv = .{
108+
.path = "windows_argv",
109+
},
107110
.self_exe_symlink = .{
108111
.path = "self_exe_symlink",
109112
},
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
Tests that Zig's `std.process.ArgIteratorWindows` is compatible with both the MSVC and MinGW C runtimes' argv splitting algorithms.
2+
3+
The method of testing is:
4+
- Compile a C file with `wmain` as its entry point
5+
- The C `wmain` calls a Zig-implemented `verify` function that takes the `argv` from `wmain` and compares it to the argv gotten from `std.process.argsAlloc` (which takes `kernel32.GetCommandLineW()` and splits it)
6+
- The compiled C program is spawned continuously as a child process by the implementation in `fuzz.zig` with randomly generated command lines
7+
+ On Windows, the 'application name' and the 'command line' are disjoint concepts. That is, you can spawn `foo.exe` but set the command line to `bar.exe`, and `CreateProcessW` will spawn `foo.exe` but `argv[0]` will be `bar.exe`. This quirk allows us to test arbitrary `argv[0]` values as well which otherwise wouldn't be possible.
8+
9+
Note: This is intentionally testing against the C runtime argv splitting and *not* [`CommandLineToArgvW`](https://learn.microsoft.com/en-us/windows/win32/api/shellapi/nf-shellapi-commandlinetoargvw), since the C runtime argv splitting was updated in 2008 but `CommandLineToArgvW` still uses the pre-2008 algorithm (which differs in both `argv[0]` rules and `""`; see [here](https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULESDOC) for details)
10+
11+
---
12+
13+
In addition to being run during `zig build test-standalone`, this test can be run on its own via `zig build test` from within this directory.
14+
15+
When run on its own:
16+
- `-Diterations=<num>` can be used to set the max fuzzing iterations, and `-Diterations=0` can be used to fuzz indefinitely
17+
- `-Dseed=<num>` can be used to set the PRNG seed for fuzz testing. If not provided, then the seed is chosen at random during `build.zig` compilation.
18+
19+
On failure, the number of iterations and the seed can be seen in the failing command, e.g. in `path\to\fuzz.exe path\to\verify-msvc.exe 100 2780392459403250529`, the number of iterations is `100` and the seed is `2780392459403250529`.

0 commit comments

Comments
 (0)