Skip to content

Fix parsing for file names with spaces #24

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ likely undiscovered bugs.

- Numbers immediately followed by non-numeric characters
- Trailing characters on a line after valid or expected content
- Malformed file header lines (lines that start with `diff --git`)

2. Errors for invalid input are generally more verbose and specific than those
from `git apply`.
Expand Down
122 changes: 99 additions & 23 deletions gitdiff/file_header.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,24 +172,88 @@ func (p *parser) ParseTraditionalFileHeader() (*File, error) {
// If the names in the header do not match because the patch is a rename,
// return an empty default name.
func parseGitHeaderName(header string) (string, error) {
firstName, n, err := parseName(header, -1, 1)
if err != nil {
return "", err
header = strings.TrimSuffix(header, "\n")
if len(header) == 0 {
return "", nil
}

if n < len(header) && (header[n] == ' ' || header[n] == '\t') {
n++
}
var err error
var first, second string

secondName, _, err := parseName(header[n:], -1, 1)
if err != nil {
return "", err
// there are 4 cases to account for:
//
// 1) unquoted unquoted
// 2) unquoted "quoted"
// 3) "quoted" unquoted
// 4) "quoted" "quoted"
//
quote := strings.IndexByte(header, '"')
switch {
case quote < 0:
// case 1
first = header

case quote > 0:
// case 2
first = header[:quote-1]
if !isSpace(header[quote-1]) {
return "", fmt.Errorf("missing separator")
}

second, _, err = parseQuotedName(header[quote:])
if err != nil {
return "", err
}

case quote == 0:
// case 3 or case 4
var n int
first, n, err = parseQuotedName(header)
if err != nil {
return "", err
}

// git accepts multiple spaces after a quoted name, but not after an
// unquoted name, since the name might end with one or more spaces
for n < len(header) && isSpace(header[n]) {
n++
}
if n == len(header) {
return "", nil
}

if header[n] == '"' {
second, _, err = parseQuotedName(header[n:])
if err != nil {
return "", err
}
} else {
second = header[n:]
}
}

if firstName != secondName {
first = trimTreePrefix(first, 1)
if second != "" {
if first == trimTreePrefix(second, 1) {
return first, nil
}
return "", nil
}
return firstName, nil

// at this point, both names are unquoted (case 1)
// since names may contain spaces, we can't use a known separator
// instead, look for a split that produces two equal names

for i := 0; i < len(first)-1; i++ {
if !isSpace(first[i]) {
continue
}
second = trimTreePrefix(first[i+1:], 1)
if name := first[:i]; name == second {
return name, nil
}
}
return "", nil
}

// parseGitHeaderData parses a single line of metadata from a Git file header.
Expand Down Expand Up @@ -283,25 +347,25 @@ func parseGitHeaderCreatedMode(f *File, line, defaultName string) error {

func parseGitHeaderCopyFrom(f *File, line, defaultName string) (err error) {
f.IsCopy = true
f.OldName, _, err = parseName(line, -1, 0)
f.OldName, _, err = parseName(line, 0, 0)
return
}

func parseGitHeaderCopyTo(f *File, line, defaultName string) (err error) {
f.IsCopy = true
f.NewName, _, err = parseName(line, -1, 0)
f.NewName, _, err = parseName(line, 0, 0)
return
}

func parseGitHeaderRenameFrom(f *File, line, defaultName string) (err error) {
f.IsRename = true
f.OldName, _, err = parseName(line, -1, 0)
f.OldName, _, err = parseName(line, 0, 0)
return
}

func parseGitHeaderRenameTo(f *File, line, defaultName string) (err error) {
f.IsRename = true
f.NewName, _, err = parseName(line, -1, 0)
f.NewName, _, err = parseName(line, 0, 0)
return
}

Expand Down Expand Up @@ -349,14 +413,14 @@ func parseMode(s string) (os.FileMode, error) {

// parseName extracts a file name from the start of a string and returns the
// name and the index of the first character after the name. If the name is
// unquoted and term is non-negative, parsing stops at the first occurrence of
// term. Otherwise parsing of unquoted names stops at the first space or tab.
// unquoted and term is non-zero, parsing stops at the first occurrence of
// term.
//
// If the name is exactly "/dev/null", no further processing occurs. Otherwise,
// if dropPrefix is greater than zero, that number of prefix components
// separated by forward slashes are dropped from the name and any duplicate
// slashes are collapsed.
func parseName(s string, term rune, dropPrefix int) (name string, n int, err error) {
func parseName(s string, term byte, dropPrefix int) (name string, n int, err error) {
if len(s) > 0 && s[0] == '"' {
name, n, err = parseQuotedName(s)
} else {
Expand Down Expand Up @@ -387,15 +451,12 @@ func parseQuotedName(s string) (name string, n int, err error) {
return name, n, err
}

func parseUnquotedName(s string, term rune) (name string, n int, err error) {
func parseUnquotedName(s string, term byte) (name string, n int, err error) {
for n = 0; n < len(s); n++ {
if s[n] == '\n' {
break
}
if term >= 0 && rune(s[n]) == term {
break
}
if term < 0 && (s[n] == ' ' || s[n] == '\t') {
if term > 0 && s[n] == term {
break
}
}
Expand Down Expand Up @@ -440,6 +501,17 @@ func cleanName(name string, drop int) string {
return b.String()
}

// trimTreePrefix removes up to n leading directory components from name.
func trimTreePrefix(name string, n int) string {
i := 0
for ; i < len(name) && n > 0; i++ {
if name[i] == '/' {
n--
}
}
return name[i:]
}

// hasEpochTimestamp returns true if the string ends with a POSIX-formatted
// timestamp for the UNIX epoch after a tab character. According to git, this
// is used by GNU diff to mark creations and deletions.
Expand Down Expand Up @@ -468,3 +540,7 @@ func hasEpochTimestamp(s string) bool {
}
return true
}

func isSpace(c byte) bool {
return c == ' ' || c == '\t' || c == '\n'
}
43 changes: 35 additions & 8 deletions gitdiff/file_header_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ func TestCleanName(t *testing.T) {
func TestParseName(t *testing.T) {
tests := map[string]struct {
Input string
Term rune
Term byte
Drop int
Output string
N int
Expand All @@ -334,14 +334,17 @@ func TestParseName(t *testing.T) {
"dropPrefix": {
Input: "a/dir/file.txt", Drop: 1, Output: "dir/file.txt", N: 14,
},
"multipleNames": {
Input: "dir/a.txt dir/b.txt", Term: -1, Output: "dir/a.txt", N: 9,
"unquotedWithSpaces": {
Input: "dir/with spaces.txt", Output: "dir/with spaces.txt", N: 19,
},
"unquotedWithTrailingSpaces": {
Input: "dir/with spaces.space ", Output: "dir/with spaces.space ", N: 23,
},
"devNull": {
Input: "/dev/null", Term: '\t', Drop: 1, Output: "/dev/null", N: 9,
},
"newlineAlwaysSeparates": {
Input: "dir/file.txt\n", Term: 0, Output: "dir/file.txt", N: 12,
"newlineSeparates": {
Input: "dir/file.txt\n", Output: "dir/file.txt", N: 12,
},
"emptyString": {
Input: "", Err: true,
Expand Down Expand Up @@ -630,9 +633,33 @@ func TestParseGitHeaderName(t *testing.T) {
Input: "a/dir/foo.txt b/dir/bar.txt",
Output: "",
},
"missingSecondName": {
Input: "a/dir/foo.txt",
Err: true,
"matchingNamesWithSpaces": {
Input: "a/dir/file with spaces.txt b/dir/file with spaces.txt",
Output: "dir/file with spaces.txt",
},
"matchingNamesWithTrailingSpaces": {
Input: "a/dir/spaces b/dir/spaces ",
Output: "dir/spaces ",
},
"matchingNamesQuoted": {
Input: `"a/dir/\"quotes\".txt" "b/dir/\"quotes\".txt"`,
Output: `dir/"quotes".txt`,
},
"matchingNamesFirstQuoted": {
Input: `"a/dir/file.txt" b/dir/file.txt`,
Output: "dir/file.txt",
},
"matchingNamesSecondQuoted": {
Input: `a/dir/file.txt "b/dir/file.txt"`,
Output: "dir/file.txt",
},
"noSecondName": {
Input: "a/dir/foo.txt",
Output: "",
},
"noSecondNameQuoted": {
Input: `"a/dir/foo.txt"`,
Output: "",
},
"invalidName": {
Input: `"a/dir/file.txt b/dir/file.txt`,
Expand Down