Skip to content

Commit a00d2cc

Browse files
authored
Follow git logic when parsing patch identities (#44)
When GitHub creates patches for Dependabot PRs, it generates a "From:" line that is not valid according to RFC 5322: the address spec contains unquoted special characters (the "[bot]" in "dependabot[bot]"). While the 'net/mail' parser makes some exceptions to the spec, this is not one of them, so parsing these patch headers fails. Git's 'mailinfo' command avoids this by only implementing the unquoting part of RFC 5322 and then applying a heuristic to separate the string into name and email values that seem reasonable. This commit does two things: 1. Reimplements ParsePatchIdentity to follow Git's logic, so that it can accept a wider range of inputs, including quoted strings. Strings accepted by the previous implementation parse in the same way with one exception: inputs that contain whitespace inside the angle brackets for an email address now use the email address as the name and drop any separate name component. 2. When parsing mail-formatted patches, use ParsePatchIdentity to parse the "From:" line instead of the 'net/mail' function.
1 parent 3f2ea5c commit a00d2cc

File tree

4 files changed

+321
-142
lines changed

4 files changed

+321
-142
lines changed

gitdiff/patch_header.go

Lines changed: 6 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -68,62 +68,6 @@ func (h *PatchHeader) Message() string {
6868
return msg.String()
6969
}
7070

71-
// PatchIdentity identifies a person who authored or committed a patch.
72-
type PatchIdentity struct {
73-
Name string
74-
Email string
75-
}
76-
77-
func (i PatchIdentity) String() string {
78-
name := i.Name
79-
if name == "" {
80-
name = `""`
81-
}
82-
return fmt.Sprintf("%s <%s>", name, i.Email)
83-
}
84-
85-
// ParsePatchIdentity parses a patch identity string. A valid string contains
86-
// an optional name followed by an email address in angle brackets. The angle
87-
// brackets must always exist, but may enclose an empty address. At least one
88-
// of the name or the email address must be non-empty. If the string only
89-
// contains an email address, that value is also used as the name.
90-
//
91-
// The name must not contain a left angle bracket, '<', and the email address
92-
// must not contain a right angle bracket, '>'. Otherwise, there are no
93-
// restrictions on the format of either field.
94-
func ParsePatchIdentity(s string) (PatchIdentity, error) {
95-
var emailStart, emailEnd int
96-
for i, c := range s {
97-
if c == '<' && emailStart == 0 {
98-
emailStart = i + 1
99-
}
100-
if c == '>' && emailStart > 0 {
101-
emailEnd = i
102-
break
103-
}
104-
}
105-
if emailStart > 0 && emailEnd == 0 {
106-
return PatchIdentity{}, fmt.Errorf("invalid identity string: unclosed email section: %s", s)
107-
}
108-
109-
var name, email string
110-
if emailStart > 0 {
111-
name = strings.TrimSpace(s[:emailStart-1])
112-
}
113-
if emailStart > 0 && emailEnd > 0 {
114-
email = strings.TrimSpace(s[emailStart:emailEnd])
115-
}
116-
if name == "" && email != "" {
117-
name = email
118-
}
119-
120-
if name == "" && email == "" {
121-
return PatchIdentity{}, fmt.Errorf("invalid identity string: %s", s)
122-
}
123-
124-
return PatchIdentity{Name: name, Email: email}, nil
125-
}
126-
12771
// ParsePatchDate parses a patch date string. It returns the parsed time or an
12872
// error if s has an unknown format. ParsePatchDate supports the iso, rfc,
12973
// short, raw, unix, and default formats (with local variants) used by the
@@ -425,16 +369,13 @@ func parseHeaderMail(mailLine string, r io.Reader, opts patchHeaderOptions) (*Pa
425369
}
426370
}
427371

428-
addrs, err := msg.Header.AddressList("From")
429-
if err != nil && !errors.Is(err, mail.ErrHeaderNotPresent) {
430-
return nil, err
431-
}
432-
if len(addrs) > 0 {
433-
addr := addrs[0]
434-
if addr.Name == "" {
435-
addr.Name = addr.Address
372+
from := msg.Header.Get("From")
373+
if from != "" {
374+
u, err := ParsePatchIdentity(from)
375+
if err != nil {
376+
return nil, err
436377
}
437-
h.Author = &PatchIdentity{Name: addr.Name, Email: addr.Address}
378+
h.Author = &u
438379
}
439380

440381
date := msg.Header.Get("Date")

gitdiff/patch_header_test.go

Lines changed: 22 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -5,83 +5,6 @@ import (
55
"time"
66
)
77

8-
func TestParsePatchIdentity(t *testing.T) {
9-
tests := map[string]struct {
10-
Input string
11-
Output PatchIdentity
12-
Err interface{}
13-
}{
14-
"simple": {
15-
Input: "Morton Haypenny <[email protected]>",
16-
Output: PatchIdentity{
17-
Name: "Morton Haypenny",
18-
19-
},
20-
},
21-
"extraWhitespace": {
22-
Input: " Morton Haypenny <[email protected] > ",
23-
Output: PatchIdentity{
24-
Name: "Morton Haypenny",
25-
26-
},
27-
},
28-
"trailingCharacters": {
29-
Input: "Morton Haypenny <[email protected]> unrelated garbage",
30-
Output: PatchIdentity{
31-
Name: "Morton Haypenny",
32-
33-
},
34-
},
35-
"onlyEmail": {
36-
Input: "<[email protected]>",
37-
Output: PatchIdentity{
38-
39-
40-
},
41-
},
42-
"emptyEmail": {
43-
Input: "Morton Haypenny <>",
44-
Output: PatchIdentity{
45-
Name: "Morton Haypenny",
46-
Email: "",
47-
},
48-
},
49-
"missingEmail": {
50-
Input: "Morton Haypenny",
51-
Err: "invalid identity",
52-
},
53-
"missingNameAndEmptyEmail": {
54-
Input: "<>",
55-
Err: "invalid identity",
56-
},
57-
"empty": {
58-
Input: "",
59-
Err: "invalid identity",
60-
},
61-
"unclosedEmail": {
62-
Input: "Morton Haypenny <[email protected]",
63-
Err: "unclosed email",
64-
},
65-
}
66-
67-
for name, test := range tests {
68-
t.Run(name, func(t *testing.T) {
69-
id, err := ParsePatchIdentity(test.Input)
70-
if test.Err != nil {
71-
assertError(t, test.Err, err, "parsing identity")
72-
return
73-
}
74-
if err != nil {
75-
t.Fatalf("unexpected error parsing identity: %v", err)
76-
}
77-
78-
if test.Output != id {
79-
t.Errorf("incorrect identity: expected %#v, actual %#v", test.Output, id)
80-
}
81-
})
82-
}
83-
}
84-
858
func TestParsePatchDate(t *testing.T) {
869
expected := time.Date(2020, 4, 9, 8, 7, 6, 0, time.UTC)
8710

@@ -349,6 +272,28 @@ Another body line.
349272
Body: expectedBody,
350273
},
351274
},
275+
"mailboxRFC5322SpecialCharacters": {
276+
Input: `From 61f5cd90bed4d204ee3feb3aa41ee91d4734855b Mon Sep 17 00:00:00 2001
277+
From: "dependabot[bot]" <12345+dependabot[bot]@users.noreply.github.com>
278+
Date: Sat, 11 Apr 2020 15:21:23 -0700
279+
Subject: [PATCH] A sample commit to test header parsing
280+
281+
The medium format shows the body, which
282+
may wrap on to multiple lines.
283+
284+
Another body line.
285+
`,
286+
Header: PatchHeader{
287+
SHA: expectedSHA,
288+
Author: &PatchIdentity{
289+
Name: "dependabot[bot]",
290+
Email: "12345+dependabot[bot]@users.noreply.github.com",
291+
},
292+
AuthorDate: expectedDate,
293+
Title: expectedTitle,
294+
Body: expectedBody,
295+
},
296+
},
352297
"mailboxAppendix": {
353298
Input: `From 61f5cd90bed4d204ee3feb3aa41ee91d4734855b Mon Sep 17 00:00:00 2001
354299
From: Morton Haypenny <[email protected]>

gitdiff/patch_identity.go

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
package gitdiff
2+
3+
import (
4+
"fmt"
5+
"strings"
6+
)
7+
8+
// PatchIdentity identifies a person who authored or committed a patch.
type PatchIdentity struct {
	// Name is the display name of the person.
	Name string
	// Email is the address of the person, without angle brackets.
	Email string
}

// String formats the identity as "Name <Email>". When Name is empty, a pair
// of double quotes is written in its place so the result still has a name
// component before the bracketed address.
func (i PatchIdentity) String() string {
	displayName := `""`
	if i.Name != "" {
		displayName = i.Name
	}
	return fmt.Sprintf("%s <%s>", displayName, i.Email)
}
21+
22+
// ParsePatchIdentity parses a patch identity string. A patch identity contains
23+
// an email address and an optional name in [RFC 5322] format. This is either a
24+
// plain email adddress or a name followed by an address in angle brackets:
25+
//
26+
27+
// Author Name <[email protected]>
28+
//
29+
// If the input is not one of these formats, ParsePatchIdentity applies a
30+
// heuristic to separate the name and email portions. If both the name and
31+
// email are missing or empty, ParsePatchIdentity returns an error. It
32+
// otherwise does not validate the result.
33+
//
34+
// [RFC 5322]: https://datatracker.ietf.org/doc/html/rfc5322
35+
func ParsePatchIdentity(s string) (PatchIdentity, error) {
36+
s = normalizeSpace(s)
37+
s = unquotePairs(s)
38+
39+
var name, email string
40+
if at := strings.IndexByte(s, '@'); at >= 0 {
41+
start, end := at, at
42+
for start >= 0 && !isRFC5332Space(s[start]) && s[start] != '<' {
43+
start--
44+
}
45+
for end < len(s) && !isRFC5332Space(s[end]) && s[end] != '>' {
46+
end++
47+
}
48+
email = s[start+1 : end]
49+
50+
// Adjust the boundaries so that we drop angle brackets, but keep
51+
// spaces when removing the email to form the name.
52+
if start < 0 || s[start] != '<' {
53+
start++
54+
}
55+
if end >= len(s) || s[end] != '>' {
56+
end--
57+
}
58+
name = s[:start] + s[end+1:]
59+
} else {
60+
start, end := 0, 0
61+
for i := 0; i < len(s); i++ {
62+
if s[i] == '<' && start == 0 {
63+
start = i + 1
64+
}
65+
if s[i] == '>' && start > 0 {
66+
end = i
67+
break
68+
}
69+
}
70+
if start > 0 && end >= start {
71+
email = strings.TrimSpace(s[start:end])
72+
name = s[:start-1]
73+
}
74+
}
75+
76+
// After extracting the email, the name might contain extra whitespace
77+
// again and may be surrounded by comment characters. The git source gives
78+
// these examples of when this can happen:
79+
//
80+
// "Name <email@domain>"
81+
// "email@domain (Name)"
82+
// "Name <email@domain> (Comment)"
83+
//
84+
name = normalizeSpace(name)
85+
if strings.HasPrefix(name, "(") && strings.HasSuffix(name, ")") {
86+
name = name[1 : len(name)-1]
87+
}
88+
name = strings.TrimSpace(name)
89+
90+
// If the name is empty or contains email-like characters, use the email
91+
// instead (assuming one exists)
92+
if name == "" || strings.ContainsAny(name, "@<>") {
93+
name = email
94+
}
95+
96+
if name == "" && email == "" {
97+
return PatchIdentity{}, fmt.Errorf("invalid identity string %q", s)
98+
}
99+
return PatchIdentity{Name: name, Email: email}, nil
100+
}
101+
102+
// unquotePairs processes the RFC 5322 tokens "quoted-string" and "comment" to
// remove any "quoted-pairs" (backslash-escaped characters). It also removes
// the quotes from any quoted strings, but leaves the comment delimiters.
//
// A backslash outside of a quoted string or comment is copied through
// unchanged. Double quotes inside a comment and parentheses inside a quoted
// string are treated as ordinary characters.
func unquotePairs(s string) string {
	var (
		out     strings.Builder
		inQuote bool // inside a quoted-string
		depth   int  // nesting depth of comments
		literal bool // previous byte was a dropped backslash
	)

	for i := 0; i < len(s); i++ {
		c := s[i]

		// The byte after a dropped backslash is always copied verbatim and
		// never changes the quote or comment state.
		if literal {
			literal = false
			out.WriteByte(c)
			continue
		}

		switch {
		case c == '\\' && (inQuote || depth > 0):
			// quoted-pair is only allowed in quoted-string/comment
			literal = true
			continue // drop '\' character
		case c == '"' && depth == 0:
			inQuote = !inQuote
			continue // drop '"' character
		case c == '(' && !inQuote:
			depth++
		case c == ')' && depth > 0:
			depth--
		}
		out.WriteByte(c)
	}
	return out.String()
}
143+
144+
// normalizeSpace trims leading and trailing whitespace from s and converts
145+
// inner sequences of one or more whitespace characters to single spaces.
146+
func normalizeSpace(s string) string {
147+
var sb strings.Builder
148+
for i := 0; i < len(s); i++ {
149+
c := s[i]
150+
if !isRFC5332Space(c) {
151+
if sb.Len() > 0 && isRFC5332Space(s[i-1]) {
152+
sb.WriteByte(' ')
153+
}
154+
sb.WriteByte(c)
155+
}
156+
}
157+
return sb.String()
158+
}
159+
160+
// isRFC5332Space reports whether c is a tab, line feed, carriage return, or
// space character.
//
// NOTE(review): the comments in this file refer to RFC 5322, so the "5332" in
// the name looks like a transposition; it is kept because other functions
// call this helper by that name.
func isRFC5332Space(c byte) bool {
	return c == ' ' || c == '\t' || c == '\r' || c == '\n'
}

0 commit comments

Comments
 (0)