From 51dcd7fbe8c8db707f978db4dfee86abdccaf4f2 Mon Sep 17 00:00:00 2001 From: Billy Keyes Date: Sat, 24 Apr 2021 13:00:15 -0700 Subject: [PATCH 1/3] Fix parsing for file names with spaces Git does not quote file names containing spaces when generating header lines. However, I misread the Git implementation and thought it used spaces as a name terminator by default, implying that names with spaces would be quoted. Rewrite the the name parsing code to better match Git. Specifically, when both names in a header are not quoted, test all splits to see if any of them produce two equal names. Also account for spaces in file names when parsing file metadata. Finally, allow trailing spaces on file names, which seem to be allowed by Git (although this sounds like a terrible idea to me.) --- README.md | 1 + gitdiff/file_header.go | 123 +++++++++++++++++++++++++++++------- gitdiff/file_header_test.go | 39 +++++++++--- 3 files changed, 132 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index b6d7b23..1f45a49 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,7 @@ likely undiscovered bugs. - Numbers immediately followed by non-numeric characters - Trailing characters on a line after valid or expected content + - Malformed file header lines (lines that start with `diff --git`) 2. Errors for invalid input are generally more verbose and specific than those from `git apply`. diff --git a/gitdiff/file_header.go b/gitdiff/file_header.go index fadfc4e..6b7e98f 100644 --- a/gitdiff/file_header.go +++ b/gitdiff/file_header.go @@ -172,24 +172,89 @@ func (p *parser) ParseTraditionalFileHeader() (*File, error) { // If the names in the header do not match because the patch is a rename, // return an empty default name. func parseGitHeaderName(header string) (string, error) { - firstName, n, err := parseName(header, -1, 1) - if err != nil { - return "", err + if len(header) == 0 { + return "", nil } - if n < len(header) && (header[n] == ' ' || header[n] == '\t') { - n++ - } + var err error + var first, second string - secondName, _, err := parseName(header[n:], -1, 1) - if err != nil { - return "", err + // there are 4 cases to account for: + // + // 1) unquoted unquoted + // 2) unquoted "quoted" + // 3) "quoted" unquoted + // 4) "quoted" "quoted" + // + quote := strings.IndexByte(header, '"') + switch { + case quote < 0: + // case 1 + first = header + + case quote > 0: + // case 2 + first = header[:quote-1] + if !isSpace(header[quote-1]) { + return "", fmt.Errorf("missing separator") + } + + second, _, err = parseQuotedName(header[quote:]) + if err != nil { + return "", err + } + + case quote == 0: + // case 3 or case 4 + var n int + first, n, err = parseQuotedName(header) + if err != nil { + return "", err + } + + // git accepts multiple spaces after a quoted name, but not after an + // unquoated name, since the name might end with one or more spaces + for n < len(header) && isSpace(header[n]) { + n++ + } + + if n < len(header) && header[n] == '"' { + second, _, err = parseQuotedName(header[n:]) + } else { + second, _, err = parseUnquotedName(header[n:], 0) + } + if err != nil { + return "", err + } } - if firstName != secondName { + first = trimTreePrefix(first, 1) + if second != "" { + if first == trimTreePrefix(second, 1) { + return first, nil + } return "", nil } - return firstName, nil + + // at this point, both names are unquoted (case 1) + // since names may contain spaces, we can't use a known separator + // instead, look for a split that produces two equal names + + end := strings.IndexByte(first, '\n') + if end < 0 { + end = len(first) + } + + for i := 0; i < end-1; i++ { + if !isSpace(first[i]) { + continue + } + second = trimTreePrefix(first[i+1:end], 1) + if name := first[:i]; name == second { + return name, nil + } + } + return "", nil } // parseGitHeaderData parses a single line of metadata from a Git file header. @@ -283,25 +348,25 @@ func parseGitHeaderCreatedMode(f *File, line, defaultName string) error { func parseGitHeaderCopyFrom(f *File, line, defaultName string) (err error) { f.IsCopy = true - f.OldName, _, err = parseName(line, -1, 0) + f.OldName, _, err = parseName(line, 0, 0) return } func parseGitHeaderCopyTo(f *File, line, defaultName string) (err error) { f.IsCopy = true - f.NewName, _, err = parseName(line, -1, 0) + f.NewName, _, err = parseName(line, 0, 0) return } func parseGitHeaderRenameFrom(f *File, line, defaultName string) (err error) { f.IsRename = true - f.OldName, _, err = parseName(line, -1, 0) + f.OldName, _, err = parseName(line, 0, 0) return } func parseGitHeaderRenameTo(f *File, line, defaultName string) (err error) { f.IsRename = true - f.NewName, _, err = parseName(line, -1, 0) + f.NewName, _, err = parseName(line, 0, 0) return } @@ -349,14 +414,14 @@ func parseMode(s string) (os.FileMode, error) { // parseName extracts a file name from the start of a string and returns the // name and the index of the first character after the name. If the name is -// unquoted and term is non-negative, parsing stops at the first occurrence of -// term. Otherwise parsing of unquoted names stops at the first space or tab. +// unquoted and term is non-zero, parsing stops at the first occurrence of +// term. // // If the name is exactly "/dev/null", no further processing occurs. Otherwise, // if dropPrefix is greater than zero, that number of prefix components // separated by forward slashes are dropped from the name and any duplicate // slashes are collapsed. -func parseName(s string, term rune, dropPrefix int) (name string, n int, err error) { +func parseName(s string, term byte, dropPrefix int) (name string, n int, err error) { if len(s) > 0 && s[0] == '"' { name, n, err = parseQuotedName(s) } else { @@ -387,15 +452,12 @@ func parseQuotedName(s string) (name string, n int, err error) { return name, n, err } -func parseUnquotedName(s string, term rune) (name string, n int, err error) { +func parseUnquotedName(s string, term byte) (name string, n int, err error) { for n = 0; n < len(s); n++ { if s[n] == '\n' { break } - if term >= 0 && rune(s[n]) == term { - break - } - if term < 0 && (s[n] == ' ' || s[n] == '\t') { + if term > 0 && s[n] == term { break } } @@ -440,6 +502,17 @@ func cleanName(name string, drop int) string { return b.String() } +// trimTreePrefix removes up to n leading directory components from name. +func trimTreePrefix(name string, n int) string { + i := 0 + for ; i < len(name) && n > 0; i++ { + if name[i] == '/' { + n-- + } + } + return name[i:] +} + // hasEpochTimestamp returns true if the string ends with a POSIX-formatted // timestamp for the UNIX epoch after a tab character. According to git, this // is used by GNU diff to mark creations and deletions. @@ -468,3 +541,7 @@ func hasEpochTimestamp(s string) bool { } return true } + +func isSpace(c byte) bool { + return c == ' ' || c == '\t' || c == '\n' +} diff --git a/gitdiff/file_header_test.go b/gitdiff/file_header_test.go index 52adda5..421e1e1 100644 --- a/gitdiff/file_header_test.go +++ b/gitdiff/file_header_test.go @@ -310,7 +310,7 @@ func TestCleanName(t *testing.T) { func TestParseName(t *testing.T) { tests := map[string]struct { Input string - Term rune + Term byte Drop int Output string N int @@ -334,14 +334,17 @@ func TestParseName(t *testing.T) { "dropPrefix": { Input: "a/dir/file.txt", Drop: 1, Output: "dir/file.txt", N: 14, }, - "multipleNames": { - Input: "dir/a.txt dir/b.txt", Term: -1, Output: "dir/a.txt", N: 9, + "unquotedWithSpaces": { + Input: "dir/with spaces.txt", Output: "dir/with spaces.txt", N: 19, + }, + "unquotedWithTrailingSpaces": { + Input: "dir/with spaces.space ", Output: "dir/with spaces.space ", N: 23, }, "devNull": { Input: "/dev/null", Term: '\t', Drop: 1, Output: "/dev/null", N: 9, }, - "newlineAlwaysSeparates": { - Input: "dir/file.txt\n", Term: 0, Output: "dir/file.txt", N: 12, + "newlineSeparates": { + Input: "dir/file.txt\n", Output: "dir/file.txt", N: 12, }, "emptyString": { Input: "", Err: true, @@ -630,9 +633,29 @@ func TestParseGitHeaderName(t *testing.T) { Input: "a/dir/foo.txt b/dir/bar.txt", Output: "", }, - "missingSecondName": { - Input: "a/dir/foo.txt", - Err: true, + "matchingNamesWithSpaces": { + Input: "a/dir/file with spaces.txt b/dir/file with spaces.txt", + Output: "dir/file with spaces.txt", + }, + "matchingNamesWithTrailingSpaces": { + Input: "a/dir/spaces b/dir/spaces ", + Output: "dir/spaces ", + }, + "matchingNamesQuoted": { + Input: `"a/dir/\"quotes\".txt" "b/dir/\"quotes\".txt"`, + Output: `dir/"quotes".txt`, + }, + "matchingNamesFirstQuoted": { + Input: `"a/dir/file.txt" b/dir/file.txt`, + Output: "dir/file.txt", + }, + "matchingNamesSecondQuoted": { + Input: `a/dir/file.txt "b/dir/file.txt"`, + Output: "dir/file.txt", + }, + "noSecondName": { + Input: "a/dir/foo.txt", + Output: "", }, "invalidName": { Input: `"a/dir/file.txt b/dir/file.txt`, From a2325ab500998afeb32d5db55f15b2c0d840d64c Mon Sep 17 00:00:00 2001 From: Billy Keyes Date: Sat, 24 Apr 2021 14:53:45 -0700 Subject: [PATCH 2/3] Consistent behavior with missing second name --- gitdiff/file_header.go | 23 +++++++++++------------ gitdiff/file_header_test.go | 4 ++++ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/gitdiff/file_header.go b/gitdiff/file_header.go index 6b7e98f..fef1b69 100644 --- a/gitdiff/file_header.go +++ b/gitdiff/file_header.go @@ -172,6 +172,7 @@ func (p *parser) ParseTraditionalFileHeader() (*File, error) { // If the names in the header do not match because the patch is a rename, // return an empty default name. func parseGitHeaderName(header string) (string, error) { + header = strings.TrimSuffix(header, "\n") if len(header) == 0 { return "", nil } @@ -217,14 +218,17 @@ func parseGitHeaderName(header string) (string, error) { for n < len(header) && isSpace(header[n]) { n++ } + if n == len(header) { + return "", nil + } - if n < len(header) && header[n] == '"' { + if header[n] == '"' { second, _, err = parseQuotedName(header[n:]) + if err != nil { + return "", err + } } else { - second, _, err = parseUnquotedName(header[n:], 0) - } - if err != nil { - return "", err + second = header[n:] } } @@ -240,16 +244,11 @@ func parseGitHeaderName(header string) (string, error) { // since names may contain spaces, we can't use a known separator // instead, look for a split that produces two equal names - end := strings.IndexByte(first, '\n') - if end < 0 { - end = len(first) - } - - for i := 0; i < end-1; i++ { + for i := 0; i < len(first)-1; i++ { if !isSpace(first[i]) { continue } - second = trimTreePrefix(first[i+1:end], 1) + second = trimTreePrefix(first[i+1:], 1) if name := first[:i]; name == second { return name, nil } diff --git a/gitdiff/file_header_test.go b/gitdiff/file_header_test.go index 421e1e1..99cfed3 100644 --- a/gitdiff/file_header_test.go +++ b/gitdiff/file_header_test.go @@ -657,6 +657,10 @@ func TestParseGitHeaderName(t *testing.T) { Input: "a/dir/foo.txt", Output: "", }, + "noSecondNameQuoted": { + Input: `"a/dir/foo.txt"`, + Output: "", + }, "invalidName": { Input: `"a/dir/file.txt b/dir/file.txt`, Err: true, From a346bfe492f7bb36288616241ac1ffa4c45e91d8 Mon Sep 17 00:00:00 2001 From: Billy Keyes Date: Sat, 24 Apr 2021 14:57:47 -0700 Subject: [PATCH 3/3] Fix comment typo --- gitdiff/file_header.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gitdiff/file_header.go b/gitdiff/file_header.go index fef1b69..e3185bd 100644 --- a/gitdiff/file_header.go +++ b/gitdiff/file_header.go @@ -214,7 +214,7 @@ func parseGitHeaderName(header string) (string, error) { } // git accepts multiple spaces after a quoted name, but not after an - // unquoated name, since the name might end with one or more spaces + // unquoted name, since the name might end with one or more spaces for n < len(header) && isSpace(header[n]) { n++ }