From 915769a6d55ac19a8e7de0710fcbfb720d37f6a4 Mon Sep 17 00:00:00 2001 From: Billy Keyes Date: Sat, 1 Oct 2022 13:51:03 -0700 Subject: [PATCH 1/2] Add option to control patch subject cleaning When processing mail-formatted patches, the default cleanup removed all leading content in square brackets, but this pattern is often used to identify tickets or other information that should remain in the commit title. Git supports disabling this the the `-k` and `-b` flags, which we simulate with the new SubjectCleanMode options. Use WithSubjectCleanMode(SubjectCleanPatchOnly) to only remove bracketed strings that contain "PATCH", keeping others that are (probably) part of the actual commit message. Note that because of the mail parsing library, we cannot replicate the `-k` flag exactly and always clean leading and trailing whitespace. --- README.md | 4 + gitdiff/patch_header.go | 128 ++++++++++++++++---------- gitdiff/patch_header_test.go | 171 +++++++++++++++++++++++++---------- 3 files changed, 210 insertions(+), 93 deletions(-) diff --git a/README.md b/README.md index 58bb675..01e8d2d 100644 --- a/README.md +++ b/README.md @@ -101,3 +101,7 @@ The parsing code has also had a modest amount of fuzz testing. context of each fragment must exactly match the source file; `git apply` implements a search algorithm that tries different lines and amounts of context, with further options to normalize or ignore whitespace changes. + +7. When parsing mail-formatted patch headers, leading and trailing whitespace + is always removed from `Subject` lines. There is no exact equivalent to `git + mailinfo -k`. diff --git a/gitdiff/patch_header.go b/gitdiff/patch_header.go index da71d7e..f935c6d 100644 --- a/gitdiff/patch_header.go +++ b/gitdiff/patch_header.go @@ -165,34 +165,71 @@ func ParsePatchDate(s string) (time.Time, error) { return time.Time{}, fmt.Errorf("unknown date format: %s", s) } -// ParsePatchHeader parses a preamble string as returned by Parse into a +// A PatchHeaderOption modifies the behavior of ParsePatchHeader. +type PatchHeaderOption func(*patchHeaderOptions) + +// SubjectCleanMode controls how ParsePatchHeader cleans subject lines when +// parsing mail-formatted patches. +type SubjectCleanMode int + +const ( + // SubjectCleanWhitespace removes leading and trailing whitespace. + SubjectCleanWhitespace SubjectCleanMode = iota + + // SubjectCleanAll removes leading and trailing whitespace, leading "Re:", + // "re:", and ":" strings, and leading strings enclosed by '[' and ']'. + // This is the default behavior of git (see `git mailinfo`) and this + // package. + SubjectCleanAll + + // SubjectCleanPatchOnly is the same as SubjectCleanAll, but only removes + // leading strings enclosed by '[' and ']' if they start with "PATCH". + SubjectCleanPatchOnly +) + +// WithSubjectCleanMode sets the SubjectCleanMode for header parsing. By +// default, uses SubjectCleanAll. +func WithSubjectCleanMode(m SubjectCleanMode) PatchHeaderOption { + return func(opts *patchHeaderOptions) { + opts.subjectCleanMode = m + } +} + +type patchHeaderOptions struct { + subjectCleanMode SubjectCleanMode +} + +// ParsePatchHeader parses the preamble string returned by [Parse] into a // PatchHeader. Due to the variety of header formats, some fields of the parsed // PatchHeader may be unset after parsing. // // Supported formats are the short, medium, full, fuller, and email pretty -// formats used by git diff, git log, and git show and the UNIX mailbox format -// used by git format-patch. +// formats used by `git diff`, `git log`, and `git show` and the UNIX mailbox +// format used by `git format-patch`. // -// If ParsePatchHeader detects that it is handling an email, it will -// remove extra content at the beginning of the title line, such as -// `[PATCH]` or `Re:` in the same way that `git mailinfo` does. -// SubjectPrefix will be set to the value of this removed string. -// (`git mailinfo` is the core part of `git am` that pulls information -// out of an individual mail.) +// When parsing mail-formatted headers, ParsePatchHeader tries to remove +// email-specific content from the title and body: // -// Additionally, if ParsePatchHeader detects that it's handling an -// email, it will remove a `---` line and put anything after it into -// BodyAppendix. +// - Based on the SubjectCleanMode, remove prefixes like reply markers and +// "[PATCH]" strings from the subject, saving any removed content in the +// SubjectPrefix field. Parsing always discards leading and trailing +// whitespace from the subject line. The default mode is SubjectCleanAll. // -// Those wishing the effect of a plain `git am` should use -// `PatchHeader.Title + "\n" + PatchHeader.Body` (or -// `PatchHeader.Message()`). Those wishing to retain the subject -// prefix and appendix material should use `PatchHeader.SubjectPrefix -// + PatchHeader.Title + "\n" + PatchHeader.Body + "\n" + -// PatchHeader.BodyAppendix`. -func ParsePatchHeader(header string) (*PatchHeader, error) { - header = strings.TrimSpace(header) +// - If the body contains a "---" line (3 hyphens), remove that line and any +// content after it from the body and save it in the BodyAppendix field. +// +// ParsePatchHeader tries to process content it does not understand wthout +// returning errors, but will return errors if well-identified content like +// dates or identies uses unknown or invalid formats. +func ParsePatchHeader(header string, options ...PatchHeaderOption) (*PatchHeader, error) { + opts := patchHeaderOptions{ + subjectCleanMode: SubjectCleanAll, // match git defaults + } + for _, optFn := range options { + optFn(&opts) + } + header = strings.TrimSpace(header) if header == "" { return &PatchHeader{}, nil } @@ -208,12 +245,12 @@ func ParsePatchHeader(header string) (*PatchHeader, error) { switch { case strings.HasPrefix(firstLine, mailHeaderPrefix): - return parseHeaderMail(firstLine, strings.NewReader(rest)) + return parseHeaderMail(firstLine, strings.NewReader(rest), opts) case strings.HasPrefix(firstLine, mailMinimumHeaderPrefix): // With a minimum header, the first line is part of the actual mail // content and needs to be parsed as part of the "rest" - return parseHeaderMail("", strings.NewReader(header)) + return parseHeaderMail("", strings.NewReader(header), opts) case strings.HasPrefix(firstLine, prettyHeaderPrefix): return parseHeaderPretty(firstLine, strings.NewReader(rest)) @@ -366,7 +403,7 @@ func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (st return body.String(), appendix.String() } -func parseHeaderMail(mailLine string, r io.Reader) (*PatchHeader, error) { +func parseHeaderMail(mailLine string, r io.Reader, opts patchHeaderOptions) (*PatchHeader, error) { msg, err := mail.ReadMessage(r) if err != nil { return nil, err @@ -403,7 +440,7 @@ func parseHeaderMail(mailLine string, r io.Reader) (*PatchHeader, error) { } subject := msg.Header.Get("Subject") - h.SubjectPrefix, h.Title = parseSubject(subject) + h.SubjectPrefix, h.Title = cleanSubject(subject, opts.subjectCleanMode) s := bufio.NewScanner(msg.Body) h.Body, h.BodyAppendix = scanMessageBody(s, "", true) @@ -414,23 +451,24 @@ func parseHeaderMail(mailLine string, r io.Reader) (*PatchHeader, error) { return h, nil } -// Takes an email subject and returns the patch prefix and commit -// title. i.e., `[PATCH v3 3/5] Implement foo` would return `[PATCH -// v3 3/5] ` and `Implement foo` -func parseSubject(s string) (string, string) { - // This is meant to be compatible with - // https://github.com/git/git/blob/master/mailinfo.c:cleanup_subject(). - // If compatibility with `git am` drifts, go there to see if there - // are any updates. +func cleanSubject(s string, mode SubjectCleanMode) (prefix string, subject string) { + switch mode { + case SubjectCleanAll, SubjectCleanPatchOnly: + case SubjectCleanWhitespace: + return "", strings.TrimSpace(decodeSubject(s)) + default: + panic(fmt.Sprintf("unknown clean mode: %d", mode)) + } + + // Based on the algorithm from Git in mailinfo.c:cleanup_subject() + // If compatibility with `git am` drifts, go there to see if there are any updates. at := 0 for at < len(s) { switch s[at] { case 'r', 'R': // Detect re:, Re:, rE: and RE: - if at+2 < len(s) && - (s[at+1] == 'e' || s[at+1] == 'E') && - s[at+2] == ':' { + if at+2 < len(s) && (s[at+1] == 'e' || s[at+1] == 'E') && s[at+2] == ':' { at += 3 continue } @@ -441,25 +479,21 @@ func parseSubject(s string) (string, string) { continue case '[': - // Look for closing parenthesis - j := at + 1 - for ; j < len(s); j++ { - if s[j] == ']' { - break + if i := strings.IndexByte(s[at:], ']'); i > 0 { + if mode == SubjectCleanAll || strings.Contains(s[at:at+i+1], "PATCH") { + at += i + 1 + continue } } - - if j < len(s) { - at = j + 1 - continue - } } - // Only loop if we actually removed something + // Nothing was removed, end processing break } - return s[:at], decodeSubject(s[at:]) + prefix = strings.TrimLeftFunc(s[:at], unicode.IsSpace) + subject = strings.TrimRightFunc(decodeSubject(s[at:]), unicode.IsSpace) + return } // Decodes a subject line. Currently only supports quoted-printable UTF-8. This format is the result diff --git a/gitdiff/patch_header_test.go b/gitdiff/patch_header_test.go index 57207a5..e940be2 100644 --- a/gitdiff/patch_header_test.go +++ b/gitdiff/patch_header_test.go @@ -144,9 +144,10 @@ func TestParsePatchHeader(t *testing.T) { expectedBodyAppendix := "CC: Joe Smith " tests := map[string]struct { - Input string - Header PatchHeader - Err interface{} + Input string + Options []PatchHeaderOption + Header PatchHeader + Err interface{} }{ "prettyShort": { Input: `commit 61f5cd90bed4d204ee3feb3aa41ee91d4734855b @@ -414,7 +415,7 @@ Author: Morton Haypenny Title: expectedTitle, }, }, - "empty": { + "emptyHeader": { Input: "", Header: PatchHeader{}, }, @@ -422,7 +423,7 @@ Author: Morton Haypenny for name, test := range tests { t.Run(name, func(t *testing.T) { - h, err := ParsePatchHeader(test.Input) + h, err := ParsePatchHeader(test.Input, test.Options...) if test.Err != nil { assertError(t, test.Err, err, "parsing patch header") return @@ -477,50 +478,128 @@ func assertPatchIdentity(t *testing.T, kind string, exp, act *PatchIdentity) { } } -func TestCleanupSubject(t *testing.T) { - exp := "A sample commit to test header parsing" - tests := map[string]string{ - "plain": "", - "patch": "[PATCH] ", - "patchv5": "[PATCH v5] ", - "patchrfc": "[PATCH RFC] ", - "patchnospace": "[PATCH]", - "space": " ", - "re": "re: ", - "Re": "Re: ", - "RE": "rE: ", - "rere": "re: re: ", - } - - for name, prefix := range tests { - gotprefix, gottitle := parseSubject(prefix + exp) - if gottitle != exp { - t.Errorf("%s: Incorrect parsing of prefix %s: got title %s, wanted %s", - name, prefix, gottitle, exp) - } - if gotprefix != prefix { - t.Errorf("%s: Incorrect parsing of prefix %s: got prefix %s", - name, prefix, gotprefix) - } - } +func TestCleanSubject(t *testing.T) { + expectedSubject := "A sample commit to test header parsing" - moretests := map[string]struct { - in, eprefix, etitle string + tests := map[string]struct { + Input string + Mode SubjectCleanMode + Prefix string + Subject string }{ - "Reimplement": {"Reimplement something", "", "Reimplement something"}, - "patch-reimplement": {"[PATCH v5] Reimplement something", "[PATCH v5] ", "Reimplement something"}, - "Openbracket": {"[Just to annoy people", "", "[Just to annoy people"}, + "CleanAll/noPrefix": { + Input: expectedSubject, + Mode: SubjectCleanAll, + Subject: expectedSubject, + }, + "CleanAll/patchPrefix": { + Input: "[PATCH] " + expectedSubject, + Mode: SubjectCleanAll, + Prefix: "[PATCH] ", + Subject: expectedSubject, + }, + "CleanAll/patchPrefixNoSpace": { + Input: "[PATCH]" + expectedSubject, + Mode: SubjectCleanAll, + Prefix: "[PATCH]", + Subject: expectedSubject, + }, + "CleanAll/patchPrefixContent": { + Input: "[PATCH 3/7] " + expectedSubject, + Mode: SubjectCleanAll, + Prefix: "[PATCH 3/7] ", + Subject: expectedSubject, + }, + "CleanAll/spacePrefix": { + Input: " " + expectedSubject, + Mode: SubjectCleanAll, + Subject: expectedSubject, + }, + "CleanAll/replyLowerPrefix": { + Input: "re: " + expectedSubject, + Mode: SubjectCleanAll, + Prefix: "re: ", + Subject: expectedSubject, + }, + "CleanAll/replyMixedPrefix": { + Input: "Re: " + expectedSubject, + Mode: SubjectCleanAll, + Prefix: "Re: ", + Subject: expectedSubject, + }, + "CleanAll/replyCapsPrefix": { + Input: "RE: " + expectedSubject, + Mode: SubjectCleanAll, + Prefix: "RE: ", + Subject: expectedSubject, + }, + "CleanAll/replyDoublePrefix": { + Input: "Re: re: " + expectedSubject, + Mode: SubjectCleanAll, + Prefix: "Re: re: ", + Subject: expectedSubject, + }, + "CleanAll/noPrefixSubjectHasRe": { + Input: "Reimplement parsing", + Mode: SubjectCleanAll, + Subject: "Reimplement parsing", + }, + "CleanAll/patchPrefixSubjectHasRe": { + Input: "[PATCH 1/2] Reimplement parsing", + Mode: SubjectCleanAll, + Prefix: "[PATCH 1/2] ", + Subject: "Reimplement parsing", + }, + "CleanAll/unclosedPrefix": { + Input: "[Just to annoy people", + Mode: SubjectCleanAll, + Subject: "[Just to annoy people", + }, + "CleanAll/multiplePrefix": { + Input: " Re:Re: [PATCH 1/2][DRAFT] " + expectedSubject + " ", + Mode: SubjectCleanAll, + Prefix: "Re:Re: [PATCH 1/2][DRAFT] ", + Subject: expectedSubject, + }, + "CleanPatchOnly/patchPrefix": { + Input: "[PATCH] " + expectedSubject, + Mode: SubjectCleanPatchOnly, + Prefix: "[PATCH] ", + Subject: expectedSubject, + }, + "CleanPatchOnly/mixedPrefix": { + Input: "[PATCH] [TICKET-123] " + expectedSubject, + Mode: SubjectCleanPatchOnly, + Prefix: "[PATCH] ", + Subject: "[TICKET-123] " + expectedSubject, + }, + "CleanPatchOnly/multiplePrefix": { + Input: "Re:Re: [PATCH 1/2][DRAFT] " + expectedSubject, + Mode: SubjectCleanPatchOnly, + Prefix: "Re:Re: [PATCH 1/2]", + Subject: "[DRAFT] " + expectedSubject, + }, + "CleanWhitespace/leadingSpace": { + Input: " [PATCH] " + expectedSubject, + Mode: SubjectCleanWhitespace, + Subject: "[PATCH] " + expectedSubject, + }, + "CleanWhitespace/trailingSpace": { + Input: "[PATCH] " + expectedSubject + " ", + Mode: SubjectCleanWhitespace, + Subject: "[PATCH] " + expectedSubject, + }, } - for name, test := range moretests { - prefix, title := parseSubject(test.in) - if title != test.etitle { - t.Errorf("%s: Incorrect parsing of %s: got title %s, wanted %s", - name, test.in, title, test.etitle) - } - if prefix != test.eprefix { - t.Errorf("%s: Incorrect parsing of %s: got prefix %s, wanted %s", - name, test.in, title, test.etitle) - } + for name, test := range tests { + t.Run(name, func(t *testing.T) { + prefix, subject := cleanSubject(test.Input, test.Mode) + if prefix != test.Prefix { + t.Errorf("incorrect prefix: expected %q, actual %q", test.Prefix, prefix) + } + if subject != test.Subject { + t.Errorf("incorrect subject: expected %q, actual %q", test.Subject, subject) + } + }) } } From 31a910ee5b4d667bf48a97d27ce60b1705ffb6e1 Mon Sep 17 00:00:00 2001 From: Billy Keyes Date: Sat, 1 Oct 2022 14:44:52 -0700 Subject: [PATCH 2/2] Test passing options to top-level func --- gitdiff/patch_header_test.go | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/gitdiff/patch_header_test.go b/gitdiff/patch_header_test.go index e940be2..003e8c7 100644 --- a/gitdiff/patch_header_test.go +++ b/gitdiff/patch_header_test.go @@ -270,6 +270,28 @@ Another body line. Body: expectedBody, }, }, + "mailboxPatchOnly": { + Input: `From 61f5cd90bed4d204ee3feb3aa41ee91d4734855b Mon Sep 17 00:00:00 2001 +From: Morton Haypenny +Date: Sat, 11 Apr 2020 15:21:23 -0700 +Subject: [PATCH] [BUG-123] A sample commit to test header parsing + +The medium format shows the body, which +may wrap on to multiple lines. + +Another body line. +`, + Options: []PatchHeaderOption{ + WithSubjectCleanMode(SubjectCleanPatchOnly), + }, + Header: PatchHeader{ + SHA: expectedSHA, + Author: expectedIdentity, + AuthorDate: expectedDate, + Title: "[BUG-123] " + expectedTitle, + Body: expectedBody, + }, + }, "mailboxEmojiOneLine": { Input: `From 61f5cd90bed4d204ee3feb3aa41ee91d4734855b Mon Sep 17 00:00:00 2001 From: Morton Haypenny