@@ -625,11 +625,22 @@ pub const ArgIteratorWasi = struct {
625625};
626626
627627/// Iterator that implements the Windows command-line parsing algorithm.
628+ /// The implementation is intended to be compatible with the post-2008 C runtime,
629+ /// but is *not* intended to be compatible with `CommandLineToArgvW` since
630+ /// `CommandLineToArgvW` uses the pre-2008 parsing rules.
628631///
629- /// This iterator faithfully implements the parsing behavior observed in `CommandLineToArgvW` with
632+ /// This iterator faithfully implements the parsing behavior observed from the C runtime with
630633/// one exception: if the command-line string is empty, the iterator will immediately complete
631- /// without returning any arguments (whereas `CommandLineArgvW` will return a single argument
634+ /// without returning any arguments (whereas the C runtime will return a single argument
632635/// representing the name of the current executable).
636+ ///
637+ /// The essential parts of the algorithm are described in Microsoft's documentation:
638+ ///
639+ /// - https://learn.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-170#parsing-c-command-line-arguments
640+ ///
641+ /// David Deley explains some additional undocumented quirks in great detail:
642+ ///
643+ /// - https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULES
633644pub const ArgIteratorWindows = struct {
634645 allocator : Allocator ,
635646 /// Owned by the iterator.
@@ -686,6 +697,51 @@ pub const ArgIteratorWindows = struct {
686697 fn emitCharacter (self : * ArgIteratorWindows , char : u8 ) void { // appends one byte to the output buffer, then merges a trailing surrogate pair if one was just completed
687698 self .buffer [self .end ] = char ; // NOTE(review): no bounds check here; presumably `buffer` is sized from `cmd_line` - confirm
688699 self .end += 1 ;
700+
701+ // Because we are emitting WTF-8 byte-by-byte, we need to
702+ // check to see if we've emitted two consecutive surrogate
703+ // codepoints that form a valid surrogate pair in order
704+ // to ensure that we're always emitting well-formed WTF-8
705+ // (https://simonsapin.github.io/wtf-8/#concatenating).
706+ //
707+ // If we do have a valid surrogate pair, we need to emit
708+ // the UTF-8 sequence for the codepoint that they encode
709+ // instead of the WTF-8 encoding for the two surrogate codepoints
710+ // separately.
711+ //
712+ // This is relevant when dealing with a WTF-16 encoded
713+ // command line like this:
714+ // "<0xD801>"<0xDC37>
715+ // which would get converted to WTF-8 in `cmd_line` as:
716+ // "<0xED><0xA0><0x81>"<0xED><0xB0><0xB7>
717+ // and then after parsing it'd naively get emitted as:
718+ // <0xED><0xA0><0x81><0xED><0xB0><0xB7>
719+ // but instead, we need to recognize the surrogate pair
720+ // and emit the codepoint it encodes, which in this
721+ // example is U+10437 (𐐷), which is encoded in UTF-8 as:
722+ // <0xF0><0x90><0x90><0xB7>
723+ concatSurrogatePair (self ); // runs after every emitted byte; it returns without changes unless the last 6 bytes decode to a high+low surrogate pair
724+ }
725+
726+ fn concatSurrogatePair (self : * ArgIteratorWindows ) void { // if the bytes emitted since `self.start` end in a high+low surrogate pair, re-encode that pair in place as one UTF-8 sequence
727+ // Surrogate codepoints are always encoded as 3 bytes in WTF-8, so at
728+ // least 6 bytes must have been emitted for a surrogate pair to exist.
729+ if (self .end - self .start >= 6 ) {
730+ const window = self .buffer [self .end - 6 .. self .end ]; // the last 6 emitted bytes; a slice into `self.buffer`, not a copy
731+ const view = std .unicode .Wtf8View .init (window ) catch return ; // nothing to merge if these 6 bytes do not validate as WTF-8 on their own
732+ var it = view .iterator ();
733+ var pair : [2 ]u16 = undefined ; // filled with little-endian code units, the form `utf16LeToUtf8` is given below
734+ pair [0 ] = std .mem .nativeToLittle (u16 , std .math .cast (u16 , it .nextCodepoint ().? ) orelse return ); // a codepoint that does not fit in u16 cannot be a surrogate
735+ if (! std .unicode .utf16IsHighSurrogate (std .mem .littleToNative (u16 , pair [0 ]))) return ;
736+ pair [1 ] = std .mem .nativeToLittle (u16 , std .math .cast (u16 , it .nextCodepoint ().? ) orelse return );
737+ if (! std .unicode .utf16IsLowSurrogate (std .mem .littleToNative (u16 , pair [1 ]))) return ;
738+ // Both checks passed, so `pair` is a valid surrogate pair. Convert
739+ // it to UTF-8, overwriting the two surrogates' bytes in `window`,
740+ // and then shrink `self.end` to drop the bytes that are no longer needed.
741+ const len = std .unicode .utf16LeToUtf8 (window , & pair ) catch unreachable ; // treated as infallible because the pair was validated above
742+ const delta = 6 - len ; // number of bytes freed by replacing the 6-byte surrogate encoding with the `len`-byte result
743+ self .end -= delta ;
744+ }
689745 }
690746
691747 fn yieldArg (self : * ArgIteratorWindows ) [:0 ]const u8 {
@@ -711,69 +767,37 @@ pub const ArgIteratorWindows = struct {
711767 }
712768 };
713769
714- // The essential parts of the algorithm are described in Microsoft's documentation:
715- //
716- // - <https://learn.microsoft.com/en-us/cpp/cpp/main-function-command-line-args?view=msvc-170#parsing-c-command-line-arguments>
717- // - <https://learn.microsoft.com/en-us/windows/win32/api/shellapi/nf-shellapi-commandlinetoargvw>
718- //
719- // David Deley explains some additional undocumented quirks in great detail:
720- //
721- // - <https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULES>
722- //
723- // Code points <= U+0020 terminating an unquoted first argument was discovered independently by
724- // testing and observing the behavior of 'CommandLineToArgvW' on Windows 10.
725-
726770 fn nextWithStrategy (self : * ArgIteratorWindows , comptime strategy : type ) strategy.T {
727771 // The first argument (the executable name) uses different parsing rules.
728772 if (self .index == 0 ) {
729- var char = if (self .cmd_line .len != 0 ) self .cmd_line [0 ] else 0 ;
730- switch (char ) {
731- 0 = > {
732- // Immediately complete the iterator.
733- // 'CommandLineToArgvW' would return the name of the current executable here.
734- return strategy .eof ;
735- },
736- '"' = > {
737- // If the first character is a quote, read everything until the next quote (then
738- // skip that quote), or until the end of the string.
739- self .index += 1 ;
740- while (true ) : (self .index += 1 ) {
741- char = if (self .index != self .cmd_line .len ) self .cmd_line [self .index ] else 0 ;
742- switch (char ) {
743- 0 = > {
744- return strategy .yieldArg (self );
745- },
746- '"' = > {
747- self .index += 1 ;
748- return strategy .yieldArg (self );
749- },
750- else = > {
751- strategy .emitCharacter (self , char );
752- },
753- }
754- }
755- },
756- else = > {
757- // Otherwise, read everything until the next space or ASCII control character
758- // (not including DEL) (then skip that character), or until the end of the
759- // string. This means that if the command-line string starts with one of these
760- // characters, the first returned argument will be the empty string.
761- while (true ) : (self .index += 1 ) {
762- char = if (self .index != self .cmd_line .len ) self .cmd_line [self .index ] else 0 ;
763- switch (char ) {
764- 0 = > {
765- return strategy .yieldArg (self );
766- },
767- '\x01 ' ... ' ' = > {
768- self .index += 1 ;
769- return strategy .yieldArg (self );
770- },
771- else = > {
772- strategy .emitCharacter (self , char );
773- },
773+ if (self .cmd_line .len == 0 or self .cmd_line [0 ] == 0 ) {
774+ // Immediately complete the iterator.
775+ // The C runtime would return the name of the current executable here.
776+ return strategy .eof ;
777+ }
778+
779+ var inside_quotes = false ;
780+ while (true ) : (self .index += 1 ) {
781+ const char = if (self .index != self .cmd_line .len ) self .cmd_line [self .index ] else 0 ;
782+ switch (char ) {
783+ 0 = > {
784+ return strategy .yieldArg (self );
785+ },
786+ '"' = > {
787+ inside_quotes = ! inside_quotes ;
788+ },
789+ ' ' , '\t ' = > {
790+ if (inside_quotes )
791+ strategy .emitCharacter (self , char )
792+ else {
793+ self .index += 1 ;
794+ return strategy .yieldArg (self );
774795 }
775- }
776- },
796+ },
797+ else = > {
798+ strategy .emitCharacter (self , char );
799+ },
800+ }
777801 }
778802 }
779803
@@ -791,9 +815,10 @@ pub const ArgIteratorWindows = struct {
791815 //
792816 // - The end of the string always terminates the current argument.
793817 // - When not in 'inside_quotes' mode, a space or tab terminates the current argument.
794- // - 2n backslashes followed by a quote emit n backslashes. If in 'inside_quotes' and the
795- // quote is immediately followed by a second quote, one quote is emitted and the other is
796- // skipped, otherwise, the quote is skipped. Finally, 'inside_quotes' is toggled.
818+ // - 2n backslashes followed by a quote emit n backslashes (note: n can be zero).
819+ // If in 'inside_quotes' and the quote is immediately followed by a second quote,
820+ // one quote is emitted and the other is skipped, otherwise, the quote is skipped
821+ // and 'inside_quotes' is toggled.
797822 // - 2n + 1 backslashes followed by a quote emit n backslashes followed by a quote.
798823 // - n backslashes not followed by a quote emit n backslashes.
799824 var backslash_count : usize = 0 ;
@@ -826,8 +851,9 @@ pub const ArgIteratorWindows = struct {
826851 {
827852 strategy .emitCharacter (self , '"' );
828853 self .index += 1 ;
854+ } else {
855+ inside_quotes = ! inside_quotes ;
829856 }
830- inside_quotes = ! inside_quotes ;
831857 }
832858 },
833859 '\\ ' = > {
@@ -1215,10 +1241,10 @@ test ArgIteratorWindows {
12151241 // Separators
12161242 try t ("aa bb cc" , &.{ "aa" , "bb" , "cc" });
12171243 try t ("aa\t bb\t cc" , &.{ "aa" , "bb" , "cc" });
1218- try t ("aa\n bb\n cc" , &.{ "aa" , "bb \ n cc" });
1219- try t ("aa\r \n bb\r \n cc" , &.{ "aa" , " \ n bb\r \n cc" });
1220- try t ("aa\r bb\r cc" , &.{ "aa" , "bb \ r cc" });
1221- try t ("aa\x07 bb\x07 cc" , &.{ "aa" , "bb \ x07 cc" });
1244+ try t ("aa\n bb\n cc" , &.{"aa\n bb \ n cc" });
1245+ try t ("aa\r \n bb\r \n cc" , &.{"aa\r \ n bb\r \n cc" });
1246+ try t ("aa\r bb\r cc" , &.{"aa\r bb \ r cc" });
1247+ try t ("aa\x07 bb\x07 cc" , &.{"aa\x07 bb \ x07 cc" });
12221248 try t ("aa\x7F bb\x7F cc" , &.{"aa\x7F bb\x7F cc" });
12231249 try t ("aa🦎bb🦎cc" , &.{"aa🦎bb🦎cc" });
12241250
@@ -1227,22 +1253,22 @@ test ArgIteratorWindows {
12271253 try t (" aa bb " , &.{ "" , "aa" , "bb" });
12281254 try t ("\t\t " , &.{"" });
12291255 try t ("\t\t aa\t\t bb\t\t " , &.{ "" , "aa" , "bb" });
1230- try t ("\n\n " , &.{ "" , " \n " });
1231- try t ("\n\n aa\n\n bb\n\n " , &.{ "" , " \ n aa\n\n bb\n\n " });
1256+ try t ("\n\n " , &.{" \n\n " });
1257+ try t ("\n\n aa\n\n bb\n\n " , &.{" \n\ n aa\n\n bb\n\n " });
12321258
12331259 // Executable name with quotes/backslashes
12341260 try t ("\" aa bb\t cc\n dd\" " , &.{"aa bb\t cc\n dd" });
12351261 try t ("\" " , &.{"" });
12361262 try t ("\"\" " , &.{"" });
1237- try t ("\"\"\" " , &.{ "" , "" });
1238- try t ("\"\"\"\" " , &.{ "" , "" });
1239- try t ("\"\"\"\"\" " , &.{ "" , " \" " });
1240- try t ("aa\" bb\" cc\" dd" , &.{"aa \" bb \" cc \" dd " });
1241- try t ("aa\" bb cc\" dd" , &.{ "aa \" bb" , " ccdd" });
1242- try t ("\" aa\\ \" bb\" " , &.{ "aa\\ " , " bb" });
1263+ try t ("\"\"\" " , &.{"" });
1264+ try t ("\"\"\"\" " , &.{"" });
1265+ try t ("\"\"\"\"\" " , &.{"" });
1266+ try t ("aa\" bb\" cc\" dd" , &.{"aabbccdd " });
1267+ try t ("aa\" bb cc\" dd" , &.{"aabb ccdd" });
1268+ try t ("\" aa\\ \" bb\" " , &.{"aa\\ bb" });
12431269 try t ("\" aa\\\\ \" " , &.{"aa\\\\ " });
1244- try t ("aa\\ \" bb" , &.{"aa\\ \" bb" });
1245- try t ("aa\\\\ \" bb" , &.{"aa\\\\ \" bb" });
1270+ try t ("aa\\ \" bb" , &.{"aa\\ bb" });
1271+ try t ("aa\\\\ \" bb" , &.{"aa\\\\ bb" });
12461272
12471273 // Arguments with quotes/backslashes
12481274 try t (". \" aa bb\t cc\n dd\" " , &.{ "." , "aa bb\t cc\n dd" });
@@ -1252,29 +1278,66 @@ test ArgIteratorWindows {
12521278 try t (". \"\" " , &.{ "." , "" });
12531279 try t (". \"\"\" " , &.{ "." , "\" " });
12541280 try t (". \"\"\"\" " , &.{ "." , "\" " });
1255- try t (". \"\"\"\"\" " , &.{ "." , "\" " });
1281+ try t (". \"\"\"\"\" " , &.{ "." , "\"\" " });
12561282 try t (". \"\"\"\"\"\" " , &.{ "." , "\"\" " });
12571283 try t (". \" \" " , &.{ "." , " " });
12581284 try t (". \" \"\" " , &.{ "." , " \" " });
12591285 try t (". \" \"\"\" " , &.{ "." , " \" " });
1260- try t (". \" \"\"\"\" " , &.{ "." , " \" " });
1286+ try t (". \" \"\"\"\" " , &.{ "." , " \"\" " });
12611287 try t (". \" \"\"\"\"\" " , &.{ "." , " \"\" " });
1262- try t (". \" \"\"\"\"\"\" " , &.{ "." , " \"\" " });
1288+ try t (". \" \"\"\"\"\"\" " , &.{ "." , " \"\"\" " });
12631289 try t (". \\ \" " , &.{ "." , "\" " });
12641290 try t (". \\ \"\" " , &.{ "." , "\" " });
12651291 try t (". \\ \"\"\" " , &.{ "." , "\" " });
12661292 try t (". \\ \"\"\"\" " , &.{ "." , "\"\" " });
12671293 try t (". \\ \"\"\"\"\" " , &.{ "." , "\"\" " });
1268- try t (". \\ \"\"\"\"\"\" " , &.{ "." , "\"\" " });
1294+ try t (". \\ \"\"\"\"\"\" " , &.{ "." , "\"\"\" " });
12691295 try t (". \" \\ \" " , &.{ "." , " \" " });
12701296 try t (". \" \\ \"\" " , &.{ "." , " \" " });
12711297 try t (". \" \\ \"\"\" " , &.{ "." , " \"\" " });
12721298 try t (". \" \\ \"\"\"\" " , &.{ "." , " \"\" " });
1273- try t (". \" \\ \"\"\"\"\" " , &.{ "." , " \"\" " });
1299+ try t (". \" \\ \"\"\"\"\" " , &.{ "." , " \"\"\" " });
12741300 try t (". \" \\ \"\"\"\"\"\" " , &.{ "." , " \"\"\" " });
12751301 try t (". aa\\ bb\\\\ cc\\\\\\ dd" , &.{ "." , "aa\\ bb\\\\ cc\\\\\\ dd" });
12761302 try t (". \\\\\\ \" aa bb\" " , &.{ "." , "\\ \" aa" , "bb" });
12771303 try t (". \\\\\\\\ \" aa bb\" " , &.{ "." , "\\\\ aa bb" });
1304+
1305+ // From https://learn.microsoft.com/en-us/cpp/cpp/main-function-command-line-args#results-of-parsing-command-lines
1306+ try t (
1307+ \\foo.exe "abc" d e
1308+ , &.{ "foo.exe" , "abc" , "d" , "e" });
1309+ try t (
1310+ \\foo.exe a\\b d"e f"g h
1311+ , &.{ "foo.exe" , "a\\\\ b" , "de fg" , "h" });
1312+ try t (
1313+ \\foo.exe a\\\"b c d
1314+ , &.{ "foo.exe" , "a\\ \" b" , "c" , "d" });
1315+ try t (
1316+ \\foo.exe a\\\\"b c" d e
1317+ , &.{ "foo.exe" , "a\\\\ b c" , "d" , "e" });
1318+ try t (
1319+ \\foo.exe a"b"" c d
1320+ , &.{ "foo.exe" , "ab\" c d" });
1321+
1322+ // From https://daviddeley.com/autohotkey/parameters/parameters.htm#WINCRULESEX
1323+ try t ("foo.exe CallMeIshmael" , &.{ "foo.exe" , "CallMeIshmael" });
1324+ try t ("foo.exe \" Call Me Ishmael\" " , &.{ "foo.exe" , "Call Me Ishmael" });
1325+ try t ("foo.exe Cal\" l Me I\" shmael" , &.{ "foo.exe" , "Call Me Ishmael" });
1326+ try t ("foo.exe CallMe\\ \" Ishmael" , &.{ "foo.exe" , "CallMe\" Ishmael" });
1327+ try t ("foo.exe \" CallMe\\ \" Ishmael\" " , &.{ "foo.exe" , "CallMe\" Ishmael" });
1328+ try t ("foo.exe \" Call Me Ishmael\\\\ \" " , &.{ "foo.exe" , "Call Me Ishmael\\ " });
1329+ try t ("foo.exe \" CallMe\\\\\\ \" Ishmael\" " , &.{ "foo.exe" , "CallMe\\ \" Ishmael" });
1330+ try t ("foo.exe a\\\\\\ b" , &.{ "foo.exe" , "a\\\\\\ b" });
1331+ try t ("foo.exe \" a\\\\\\ b\" " , &.{ "foo.exe" , "a\\\\\\ b" });
1332+
1333+ // Surrogate pair encoding of 𐐷 separated by quotes.
1334+ // Encoded as WTF-16:
1335+ // "<0xD801>"<0xDC37>
1336+ // Encoded as WTF-8:
1337+ // "<0xED><0xA0><0x81>"<0xED><0xB0><0xB7>
1338+ // During parsing, the quotes drop out and the surrogate pair
1339+ // should end up encoded as its normal UTF-8 representation.
1340+ try t ("foo.exe \" \xed\xa0\x81 \" \xed\xb0\xb7 " , &.{ "foo.exe" , "𐐷" });
12781341}
12791342
12801343fn testArgIteratorWindows (cmd_line : []const u8 , expected_args : []const []const u8 ) ! void {
0 commit comments